#=======================================================================

__version__ = '''0.0.01'''
__sub_version__ = '''20120302014841'''
__copyright__ = '''(c) Alex A. Naanou 2011'''


#-----------------------------------------------------------------------
# The first index.py might be a little too complicated. Try to resolve
# this as follows:
#   - list all relevant files (RAW, XMP, JPG, PSD, ...)
#   - group by path (closeness)
#       - deepest common path to contain all files with common name.
#         this will fail if we have different files with same names.
#
#-----------------------------------------------------------------------

import os
import json
import zipfile
import uuid
import time
from itertools import izip, izip_longest

from pli.logictypes import ANY, OR

from pprint import pprint


#-----------------------------------------------------------------------

CONFIG_NAME = 'test_config.json'
##CONFIG_NAME = 'tmp_config.json'

# NOTE: the config is read at import time, relative to the current
#       working directory.
with open(CONFIG_NAME) as _config_file:
    config = json.load(_config_file)

# file extension patterns (compared against the dot-less extension)...
RAW = OR(
    'NEF', 'nef',
    'CRW', 'crw',
    'CR2', 'cr2',
    'X3F', 'x3f'
)

JPEG = OR(
    'JPG', 'jpg',
    'JPEG', 'jpeg'
)

PSD = OR(
    'PSD', 'psd'
)

TIFF = OR(
    'TIFF', 'tiff',
    'TIF', 'tif'
)

XMP = OR(
    'XMP', 'xmp'
)

# any extension we are interested in...
ITEM = OR(RAW, JPEG, PSD, TIFF, XMP)

TYPES = {
    'raw': RAW,
    'jpeg': JPEG,
    'psd': PSD,
    'tiff': TIFF,
    'xmp': XMP,
}

SUBTREE_CLASSES = {
    'preview': 'preview',
    'preview (RAW)': 'RAW preview',
}


#-----------------------------------------------------------------------

##!!! we will need to normalize the paths to one single scheme (either relative or absolute)...
# XXX might need to fetch file data too...
def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM, include_root_path=False):
    '''
    Walk root recursively and yield every file whose extension matches type.

    root              - directory to walk (passed to os.walk).
    sub_trees         - currently unused.
    type              - pattern the dot-less file extension is compared
                        to via ==; defaults to ITEM.
    include_root_path - if true the yielded path is the directory as
                        reported by os.walk, otherwise it is relative
                        to root.

    yields:
        (<path>, <name>, <ext>),

    where <path> is the list of directory names leading to the file,
    <name> is the file name without the extension and <ext> is the
    extension without the leading dot.
    '''
    sep = os.path.sep
    for path, dirs, files in os.walk(root):
        # split the directory once per directory, not once per file...
        if include_root_path:
            parts = path.split(sep)
        else:
            # NOTE: relpath is used instead of counting the components
            #       of root, as the count is off when root has a trailing
            #       or doubled separator, which silently dropped the
            #       first sub-directory from the result.
            rel = os.path.relpath(path, root)
            parts = [] if rel == os.path.curdir else rel.split(sep)
        # process files...
        for f in files:
            name, ext = os.path.splitext(f)
            # we need the extension without the dot...
            ext = ext[1:]
            # filter by ext...
            if ext == type:
                # each item gets its own copy of the path list...
                yield list(parts), name, ext


#-----------------------------------------------------------------------
if __name__ == '__main__':

    FILE_LIST = os.path.join('test', 'flatfilelist.json')
    BUILD_FILE_LIST = not os.path.exists(FILE_LIST)

    if BUILD_FILE_LIST:
        lst = list(list_files(config['ARCHIVE_ROOT']))

        print(len(lst))
        if lst:
            pprint(lst[0])

        # NOTE: the mode belongs to open(..), not to json.dump(..)
        with open(FILE_LIST, 'w') as f:
            json.dump(lst, f)

    # always work from the cached list, so both code paths see the same
    # (JSON round-tripped) data...
    with open(FILE_LIST) as f:
        lst = json.load(f)
    print(len(lst))

    ## lst.sort()
    # sort via name, ext, path
    lst.sort(key=lambda e: (e[1], e[-1], e[0]))

    ##!!! duplicate a raw file...
    # NOTE: this is here to exercise the duplicate handling below; we
    #       stop right after the append so the list being iterated is
    #       never walked past its original end.
    for p, n, t in lst:
        if t == RAW:
            lst.append((p, n, t))
            break

    # index by name (indexing preparation)...
    # {
    #     <name>: [
    #         (<path>, <name>, <type>),
    #         ...
    #     ],
    #     ...
    # }
    index = {}
    for p, n, t in lst:
        index.setdefault(n, []).append((p, n, t))

    # index via a proper GID...
    # split similarly named but different files...
    GID_index = {}
    # NOTE: GID is printed after the loop, make sure it exists even if
    #       no group was indexed...
    GID = None
    for name, l in index.items():
        l.sort()
        raws = [e for e in l if e[-1] == RAW]

        for raw in raws:
            if len(raws) > 1:
                msg = 'duplicates: %s (%sx)...' % (name, len(raws))
                # split the group into c separate groups...
                # strategies:
                #   - path proximity (distance)
                #   - metadata
                ##!!!
                print(' '.join((msg, 'skipping.')))
                break
            ##!!! gid construction should be a customizable function in itself...
            # main gid criteria:
            #   - unique
            #   - calculable from the item (preferably any sub-item)
            ## GID = '%s-%s' % (uuid.uuid4().hex, name)
            ##!!! get RAW file creation date from EXIF...
            ## GID = '%s-%s' % (hex(long(time.time()*1000))[2:-1].upper(), name)
            # GID should be human-readable...
            # XXX to avoid further ambiguity need to encode the camera
            #     into file name, e.g. S01_1234 for SLR 01 and RO1_4321 for
            #     rangefinder 01 and finally C01 for compact 01, etc.
            GID = '%s-%s' % (time.strftime('%Y%m%d-%H%M%S'), name)

            GID_index[GID] = {
                'gid': GID,
                'name': name,
                'RAW': raws,
                'XMP': [e for e in l if e[-1] == XMP],
                'JPG': [e for e in l if e[-1] == JPEG],
                'PSD': [e for e in l if e[-1] == PSD],
                'TIFF': [e for e in l if e[-1] == TIFF],
                'other': [e for e in l if e[-1] != OR(TIFF, PSD, JPEG, XMP, RAW)],
            }

    ##!!! TODO: archive descriptions to help index/tag items...

    # NOTE: each import from an existing archive will be as follows:
    #   - full listing
    #   - find new subtrees
    #   - find modified items (file date diff)

    # the last GID generated (None if nothing was indexed)...
    print(GID)

    # number of indexed groups vs. number of raw files listed...
    print('%s %s' % (len(GID_index), len([e for e in lst if e[-1] == RAW])))

    if GID_index:
        pprint(list(GID_index.values())[0])


#=======================================================================
#                                            vim:set ts=4 sw=4 nowrap :