#=======================================================================

__version__ = '''0.0.01'''
__sub_version__ = '''20120309173155'''
__copyright__ = '''(c) Alex A. Naanou 2011'''


#-----------------------------------------------------------------------
# The first index.py might be a little too complicated. try and resolve
# this as so:
#   - list all relevant files (RAW, XMP, JPG, PSD, ...)
#   - group by path (closeness)
#       - deepest common path to contain all files with common name.
#         this will fail if we have different files with same names.
#
#-----------------------------------------------------------------------

import os
import json
import zipfile
import uuid
import time

import pyexiv2 as metadata

from itertools import izip, izip_longest

from pli.logictypes import ANY, OR

from pprint import pprint


#-----------------------------------------------------------------------

CONFIG_NAME = 'test_config.json'
##CONFIG_NAME = 'tmp_config.json'

# NOTE: the config is read at import time; the handle is closed as soon
#       as the JSON is parsed.
with open(CONFIG_NAME) as _config_file:
    config = json.load(_config_file)

# XXX move this to a context-dependent module...
RAW = OR(
    'NEF', 'nef',
    'CRW', 'crw',
    'CR2', 'cr2',
    'X3F', 'x3f'
)

JPEG = OR(
    'JPG', 'jpg',
    'JPEG', 'jpeg'
)

PSD = OR(
    'PSD', 'psd'
)

TIFF = OR(
    'TIFF', 'tiff',
    'TIF', 'tif'
)

XMP = OR(
    'XMP', 'xmp'
)

ITEM = OR(RAW, JPEG, PSD, TIFF, XMP)

TYPES = {
    'raw': RAW,
    'jpeg': JPEG,
    'psd': PSD,
    'tiff': TIFF,
    'xmp': XMP,
}

SUBTREE_CLASSES = {
    'preview': 'preview',
    'preview (RAW)': 'RAW preview',
}


#-----------------------------------------------------------------------

# XXX need a strategy to check if two files that have the same GID are
#     identical, and if so, need to distinguish them in the GID...
#     might be a good idea to add a file hash
# XXX not yet sure if this is unique enough to avoid conflicts if one
#     photographer has enough cameras...
# XXX also might be wise to add a photographer ID into here...
def image_gid(path, format='%(artist)s-%(date)s-%(name)s', date_format='%Y%m%d-%H%M%S'):
    '''
    Calculate image GID.

    Main gid criteria:
        - unique
        - calculable from the item (preferably any sub-item)
        - human-readable

    Default format:
        <artist>-<date>-<name>

    Example:
        Alex_A.Naanou-20110627-195706-DSC_1234

    Supported fields:
        %(artist)s  - Exif.Image.Artist field, stripped and spaces replaced
                      with underscores.
        %(date)s    - Exif.Image.DateTime formatted to date_format argument.
        %(name)s    - file name (without directory and extension).

    Arguments:
        path        - path of the image file to read the EXIF data from.
        format      - %-style template of the resulting GID.
        date_format - strftime format applied to the EXIF date.

    Returns the formatted GID string.

    NOTE: the date and time are read from 'Exif.Image.DateTime'.
    NOTE: EXIF data is needed (and the file is read) only if the format
          uses %(date)s and/or %(artist)s.
    '''
    # get the filename...
    data = {
        'name': os.path.splitext(os.path.basename(path))[0],
    }
    need_date = '%(date)s' in format
    need_artist = '%(artist)s' in format
    # read the metadata once if any EXIF-based field is requested...
    # NOTE: previously the metadata was read only in the date branch, so
    #       a format with an artist but without a date failed with a
    #       NameError.
    if need_date or need_artist:
        i = metadata.ImageMetadata('%s' % path)
        i.read()
        # check if we need a date in the id...
        if need_date:
            d = i['Exif.Image.DateTime'].value
            data['date'] = d.strftime(date_format)
        # check if we need an artist...
        if need_artist:
            data['artist'] = i['Exif.Image.Artist'].value.strip().replace(' ', '_')

    return format % data


##!!! we will need to normalize the paths to one single scheme (either relative or absolute)...
# XXX might need to fetch file data too...
def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM, include_root_path=False, include_ctime=True):
    '''
    Walk root and yield a record for each file with a matching extension.

    yields:
        (<path>, <name>, <ext>[, <ctime>]),

    where <path> is the list of directory names leading to the file,
    <name> is the file name without extension, <ext> is the extension
    without the dot and <ctime> is os.path.getctime(..) of the file.

    Arguments:
        root              - directory to walk.
        sub_trees         - currently not used by this function.
        type              - pattern the extension is compared to (==).
        include_root_path - if true, <path> is the full split directory
                            path as reported by os.walk(..), otherwise
                            it is relative to root.
        include_ctime     - if true, add <ctime> to each record.
    '''
    for orig_path, dirs, files in os.walk(root):
        if include_root_path:
            # XXX is this correct...
            path = orig_path.split(os.path.sep)
        else:
            # remove root from path...
            # NOTE: relpath is used instead of counting the components of
            #       root, as the count is off by one when root ends with
            #       a separator (or is the filesystem root).
            rel = os.path.relpath(orig_path, root)
            path = [] if rel == os.path.curdir else rel.split(os.path.sep)
        # process files...
        for f in files:
            name, ext = os.path.splitext(f)
            # we need the extension without the dot...
            ext = ext[1:]
            # filter by ext...
            if ext == type:
                if include_ctime:
                    t = os.path.getctime(os.path.join(orig_path, f))
                    yield path, name, ext, t
                else:
                    yield path, name, ext


def common_len(a, *b):
    '''
    Return the length of the common prefix of all the given sequences.

    The elements are compared via a set, so they must be hashable.

    NOTE: if no difference is found the result is the length of the
          shortest sequence (this also covers a single or an empty
          sequence).
    '''
    seqs = (a,) + b
    for i, l in enumerate(izip(*seqs)):
        if len(set(l)) != 1:
            return i
    # NOTE: min(..) must compare lengths and not the sequences themselves...
    return min(len(s) for s in seqs)


##!!! is this meaningless?
def path_distance(a, b):
    '''
    Return the number of elements of a and b that are not part of their
    common prefix, i.e. len(a) + len(b) - 2 * common_len(a, b).
    '''
    return len(a) + len(b) - common_len(a, b)*2


def index_by_name(lst):
    '''
    Group (<path>, <name>, <type>, <ctime>) records by <name>.

    Returns:
        {
            <name>: [
                (<path>, <name>, <type>, <ctime>),
                ...
            ],
            ...
        }

    NOTE: the records are stored as tuples, in the order they appear in lst.
    '''
    index = {}
    for p, n, t, c in lst:
        index.setdefault(n, []).append((p, n, t, c))
    return index


def closest_match(weights):
    '''
    Find the largest item in a non-empty list of weights.

    Returns:
        (<weight>, <position>, <tied>)

    where <position> is the index of the first occurrence of the largest
    weight and <tied> is true if that weight occurs more than once.
    '''
    best = max(weights)
    return best, weights.index(best), weights.count(best) > 1


#-----------------------------------------------------------------------
if __name__ == '__main__':

    FILE_LIST = os.path.join('test', 'flatfilelist.json')
    # build the file list only if there is no cached one...
    BUILD_FILE_LIST = not os.path.exists(FILE_LIST)

    if BUILD_FILE_LIST:
        lst = list(list_files(config['ARCHIVE_ROOT']))

        print(len(lst))
        if lst:
            pprint(lst[0])

        with open(FILE_LIST, 'w') as f:
            json.dump(lst, f)

    # NOTE: the list is always (re)loaded from the file, so the records
    #       are always in the form produced by json...
    with open(FILE_LIST) as f:
        lst = json.load(f)
    print(len(lst))

    # sort via name, ext, path
    lst.sort(key=lambda e: (e[1], e[2], e[0]))

    # index by name (indexing preparation)...
    index = index_by_name(lst)

    # index via a proper GID...
    # split similarly named but different files...
    GID_index = {}
    failed = []
    for name, l in index.items():
        l.sort()
        raws = [e for e in l if e[2] == RAW]

        # handle multiple raw files...
        if len(raws) > 1:
            common = common_len(*[e[0] for e in raws])
            # NOTE: do not change the order of raws after this point
            #       and till the end of the loop...
            # NOTE: the items are lists and not tuples so as to keep
            #       them mutable...
            sets = [[r, [r]] for r in raws]
            for e in l:
                if e[2] == RAW:
                    continue
                # check if we are closer to other raws...
                # NOTE: this depends on stability of order in raws
                weights = [common_len(r[0], e[0]) for r in raws]
                c, i, tied = closest_match(weights)
                # we have two locations with identical weight...
                # NOTE: the previous check compared tuples to a list, so
                #       it never matched and ties were silently assigned
                #       to one of the raws.
                if tied:
                    # a file is at a path junction exactly...
                    print(' !!! can\'t decide where to put %s.%s...' % (e[1], e[2]))
                    ##!!! try different strategies here...
                    failed.append(e)
                # found a location...
                elif c > common:
                    sets[i][1].append(e)
                # file in an odd location ##!!! list these locations...
                else:
                    print(' !!! can\'t decide where to put %s.%s...' % (e[1], e[2]))
                    ##!!! try different strategies here...
                    failed.append(e)
        # single raw...
        elif len(raws) == 1:
            sets = [(raws[0], l)]
        # no raw files...
        else:
            print('no raw file found for "%s"...' % name)
            sets = []
            ##!!! need to report this in a usable way...
            failed += l

        for raw, group in sets:
            # get file GID...
            GID = image_gid('%s.%s' % (os.path.join(*[config['ARCHIVE_ROOT']] + raw[0] + [raw[1]]), raw[2]))

            # NOTE(review): 'RAW' lists all the raws sharing the name, not
            #       only the raw of this set -- confirm this is intended.
            GID_index[GID] = {
                'gid': GID,
                'name': name,
                'imported': time.time(),
                # NOTE: this might get distorted on archiving or
                #       copying...
                #       mostly intended for importing...
                'ctime': raw[3],
                'RAW': raws,
                'XMP': [e for e in group if e[2] == XMP],
                'JPG': [e for e in group if e[2] == JPEG],
                'PSD': [e for e in group if e[2] == PSD],
                'TIFF': [e for e in group if e[2] == TIFF],
                'other': [e for e in group if e[2] != OR(TIFF, PSD, JPEG, XMP, RAW)],
            }

    ##!!! TODO: archive descriptions to help index/tag items...

    # NOTE: each import from an existing archive will be as follows:
    #       - full listing
    #       - find new subtrees
    #       - find modified items (file date diff)

    # NOTE: raws number here may be more than indexed because some raws
    #       may get grouped by GID
    print('''results:
    indexed: %s
    raws: %s
    failed: %s
    ''' % (
        len(GID_index),
        len([e for e in lst if e[2] == RAW]),
        len(failed)))

    # show a sample record (if anything got indexed)...
    if GID_index:
        pprint(list(GID_index.values())[0])


#=======================================================================
#                                            vim:set ts=4 sw=4 nowrap :