From 0816077356f8c22bf5b10f7ff7412691b3b016a5 Mon Sep 17 00:00:00 2001 From: "Alex A. Naanou" Date: Thu, 10 Nov 2011 18:43:46 +0400 Subject: [PATCH] got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou --- index.py | 221 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 198 insertions(+), 23 deletions(-) diff --git a/index.py b/index.py index 0b8c7f3e..2e6daf34 100755 --- a/index.py +++ b/index.py @@ -1,7 +1,7 @@ #======================================================================= __version__ = '''0.0.01''' -__sub_version__ = '''20111103010916''' +__sub_version__ = '''20111110184147''' __copyright__ = '''(c) Alex A. Naanou 2011''' @@ -9,28 +9,55 @@ __copyright__ = '''(c) Alex A. Naanou 2011''' import os import json +import uuid -from pli.logictypes import OR +from itertools import izip, izip_longest + +from pli.logictypes import ANY, OR #----------------------------------------------------------------------- -CONFIG_NAME = 'config.json' +CONFIG_NAME = 'test_config.json' config = json.load(open(CONFIG_NAME)) -ITEM_EXTENSIONS = ( - # RAW formats... - 'NEF', 'nef', - # JPEGs... - 'JPG', 'JPEG', 'jpg', 'jpeg', - # Editid images... - 'PSD', 'psd', - 'TIFF', 'tiff', 'TIF', 'tif', - # metadata sidecar files... - 'XMP', 'xmp', +RAW = OR( + 'NEF', 'nef', + 'CRW', 'crw', + 'CR2', 'cr2', + 'X3F', 'x3f' ) +JPEG = OR( + 'JPG', 'jpg', + 'JPEG', 'jpeg' +) + +PSD = OR( + 'PSD', 'psd' +) + +TIFF = OR( + 'TIFF', 'tiff', + 'TIF', 'tif' +) + +XMP = OR( + 'XMP', 'xmp' +) + +ITEM = OR(RAW, JPEG, PSD, TIFF, XMP) + +TYPES = { + 'raw': RAW, + 'jpeg': JPEG, + 'psd': PSD, + 'tiff': TIFF, + 'xmp': XMP, +} + + SUBTREE_CLASSES = { 'preview': 'preview', 'preview (RAW)': 'RAW preview', @@ -39,22 +66,161 @@ SUBTREE_CLASSES = { #----------------------------------------------------------------------- -def list_files(root, sub_trees=SUBTREE_CLASSES, ext=OR(*ITEM_EXTENSIONS)): +##!!! we will need to normalize the paths to one single scheme (either relative or absolute)... +def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM): ''' ''' for path, dirs, files in os.walk(root): - # clasify by subtree... - p = os.path.split(path) - subtree_type = None - for t in sub_trees: - if t in p: - subtree_type = sub_trees[t] - break + path = path.split(os.path.sep) # process files... for f in files: + name, ext = os.path.splitext(f) + # we need the extension wothout the dot... + ext = ext[1:] # filter by ext... - if f.split('.')[-1] == ext: - yield subtree_type, path, f + if ext == type: + yield path, name, ext + + +# XXX need to split duplicate named raw files and corresponding +# previews... +def index_by_name(file_list, types=TYPES.items()): + ''' + format: + { + : { + : [ + , + ... + ], + ... + }, + ... + } + ''' + res = {} + for path, name, ext in file_list: + # normalize extension... + ext = types[types.index((ANY, ext))][0] + if name not in res: + # create a name... + res[name] = {} + if ext not in res[name]: + # create an extension... + res[name][ext] = [] + # general case... +## res[name][ext] += [(path, name, ext)] + res[name][ext] += [path] + return res + + +# for this to work correctly it must: +# - return unique paths +# - non of the returnd paths can be a strict subset of any other... +##!!! +def split_common(paths): + ''' + ''' + # pass 1: build list of common paths (None for all differences) + # NOTE: we may have stray common path elements but we do + # not care abut anything after a None... + index = izip_longest(*paths) + common = [] + for s in index: + next = [] + for i in s: + if s.count(i) > 1: + next += [i] + else: + next += [None] + common += [next] + # pass 2: cap each common section with a unique element... + common = [ list(e) for e in izip(*common)] + for c, p in izip(common, paths): + if None in c: + i = c.index(None) + if len(p) <= i: + # NOTE: this is the case when we have a None + # because a path just ended... i.e. there + # was no different element to split at... + # XXX do we need to break here? + # XXX one way to go here is to simply ignore + # such paths... + ##!!! XXX we will leave a None at the end of such paths for now... +## del c[i] + continue + # in-place update and truncate the common path... + c[i] = p[i] + del c[i+1:] + return common + +# in essance this need to replace image name with a GID and split up +# images that are identically named into seporate GIDs... +def split_images(index): + ''' + ''' + for name, data in index.items(): + # this will not let us lose the name of the image... + data['name'] = name + raw = data['raw'] + if len(raw) > 1: + # split the images... + # split images via closeness to one of the raw files... + # XXX the simple way to split files is to remove the + # common part of the path between two raw files and + # then split the other files by root of the + # subtree. + # this will not work in one case: + # - at least two of the raw files are in a deeper + # subtree than the other accompanying files. + # in this case wa can not use the topology to + # decide which is wich and need either to use + # some other means or to go inside the image... + # + # way to do this: + # - build a subtree map -- list of paths until the + # first unique directory + # - split files by subtree path + # - use a different strategy for files that are above + # the subtrees... + + common = split_common(raw) + + # prepare the return structure... + res = [] + for path in raw: + ##!!! + res += [{ + 'gid': uuid.uuid4(), + 'name': name, +## 'raw': [path], + }] + # start splitting the data... + for ext, paths in data.items(): + if ext not in TYPES: + continue + for path in paths: + matches = {} + for i, c in enumerate(common): + if path[:len(c)] == c: + matches[i] = len(c) + if len(matches) == 1: + i = matches.keys()[0] + # we found a location... + if ext not in res[i]: + res[i][ext] = [] + res[i][ext] += [path] + elif len(matches) > 1: + raise Exception, 'got %s matches.' % len(matches) + else: + # XXX ungrouped... + print '!!!!', path, name, ext + + # yield the results... + for e in res: + yield e['gid'], res + else: + yield uuid.uuid4(), data @@ -64,6 +230,15 @@ if __name__ == '__main__': print len(lst) + index = index_by_name(list_files(config['ARCHIVE_ROOT'])) + + print len(index) + + json.dump(index, file(os.path.join('test', 'filelist.json'), 'w')) + + index = list(split_images(index_by_name(list_files(config['ARCHIVE_ROOT'])))) + + print len(index)