From 3d41a07c7a595929412e2a5bf9063486e34209c6 Mon Sep 17 00:00:00 2001 From: "Alex A. Naanou" Date: Wed, 27 Mar 2013 18:13:52 +0400 Subject: [PATCH] lots of minor tweeks and changes... Signed-off-by: Alex A. Naanou --- gid.py | 11 ++++--- index2.py | 93 +++++++++++++++++++++++++++++++++++++++++++++---------- locate.py | 5 +-- store.py | 61 ++++++++++++++++++++++++++++++++++-- 4 files changed, 143 insertions(+), 27 deletions(-) diff --git a/gid.py b/gid.py index 694201e9..b864f054 100755 --- a/gid.py +++ b/gid.py @@ -1,7 +1,7 @@ #======================================================================= __version__ = '''0.0.01''' -__sub_version__ = '''20130322142905''' +__sub_version__ = '''20130325203750''' __copyright__ = '''(c) Alex A. Naanou 2011''' @@ -52,7 +52,8 @@ def image_gid(path, date=None, Supported fields: %(artist)s - Exif.Image.Artist field, stripped and spaces replaced with underscores. - %(date)s - Exif.Image.DateTime formated to date_format argument. + If no artist info is set this will be set to default_artist. + %(date)s - Exif.Photo.DateTimeOriginal formated to date_format argument. %(name)s - file name. NOTE: date and time are the date and time the image was made ('Exif.Image.DateTime') @@ -77,7 +78,6 @@ def image_gid(path, date=None, date = os.path.getctime(path) data['date'] = time.strftime(date_format, time.gmtime(date)) else: -## date = i['Exif.Image.DateTime'].value date = i['Exif.Photo.DateTimeOriginal'].value data['date'] = date.strftime(date_format) # check if we need an artist... @@ -85,7 +85,10 @@ def image_gid(path, date=None, data['artist'] = default_artist if i is not None: try: - data['artist'] = i['Exif.Image.Artist'].value.strip().replace(' ', '_') + # set the artist if in EXIF... + a = i['Exif.Image.Artist'].value.strip().replace(' ', '_') + if a != '': + data['artist'] = a except KeyError: pass diff --git a/index2.py b/index2.py index 01afbc2e..27ba89ce 100755 --- a/index2.py +++ b/index2.py @@ -1,7 +1,7 @@ #======================================================================= __version__ = '''0.0.01''' -__sub_version__ = '''20130319151025''' +__sub_version__ = '''20130326030151''' __copyright__ = '''(c) Alex A. Naanou 2011''' @@ -83,6 +83,9 @@ TYPES = { 'xmp': XMP, } +SKIP_DIRS = '.sys2' +SKIP_MARKER = '.skipindexing' + SUBTREE_CLASSES = { 'preview': 'preview', @@ -97,12 +100,22 @@ SUBTREE_CLASSES = { #----------------------------------------------------------list_files--- ##!!! we will need to normalize the paths to one single scheme (either relative or absolute)... # XXX might need to fetch file data too... -def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM, include_root_path=False, include_ctime=True): +def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM, + include_root_path=False, include_ctime=True, + skip_marker=SKIP_MARKER, skip_dirs=SKIP_DIRS): ''' yields: (, , [, ]), ''' for orig_path, dirs, files in os.walk(root): + # skip dir trees containing skip_filename... + if skip_marker in files: + del dirs[:] + continue + # skip dirs... + while skip_dirs in dirs: + dirs.remove(skip_dirs) + # XXX is this correct... path = orig_path.split(os.path.sep) # remove root from path... @@ -125,6 +138,7 @@ def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM, include_root_path=Fal #----------------------------------------------------------common_len--- def common_len(a, *b): ''' + calculate the common path length. ''' for i, l in enumerate(izip(*(a,) + b)): if len(set(l)) != 1: @@ -174,13 +188,12 @@ def split_by_raws(raws, lst, failed): ''' ''' ## raws = [e for e in lst if e[2] == RAW] + # top level common path... common = common_len(*[ e[0] for e in raws ]) # NOTE: do not change the order of raws after this point # and till the end of the loop... # XXX revise if there is a simpler way... - ##!!! this kills code like sets[0][1] += [...] -## sets = [ (r, [r]) for r in raws ] sets = [ [r, [r]] for r in raws ] for e in lst: @@ -199,7 +212,6 @@ def split_by_raws(raws, lst, failed): failed += [e] # found a location... elif c > common: - ##!!! for some odd reason this does not work.... sets[i][1] += [e] # file in an odd location ##!!! list these locations... else: @@ -207,14 +219,15 @@ def split_by_raws(raws, lst, failed): ##!!! try different strategies here... ##!!! failed += [e] -## return sets, failed return sets #-----------------------------------------------------------gid_index--- +##!!! this will rewrite existing data -- should only update... def gid_index(index, existing=None): ''' ''' + skipped = [] # index via a propper GID... # split similarly named but different files... if existing is None: @@ -222,34 +235,41 @@ def gid_index(index, existing=None): else: res = existing failed = [] + im_n = 0 + up_n = 0 + new_n = 0 + for name, l in index.iteritems(): l.sort() raws = [e for e in l if e[2] == RAW] # multiple raw files... if len(raws) > 1: - # spit this into a seporate func... sets = split_by_raws(raws, l, failed) # single raw... elif len(raws) == 1: sets = [(raws[0], l)] # no raw files... else: - print 'no raw file found for "%s"...' % os.path.join(name) + print (' '*78), '\rno raw file found for "%s"...' % os.path.join(name) sets = [] ##!!! need to report this in a usable way... failed += l # add actual elements to index... for raw, l in sets: + im_n += 1 + print 'Processing image:', im_n, 'new:', new_n, 'updated:', up_n, '\r', + # get file GID... GID = image_gid('%s.%s' % (os.path.join(*[config['ARCHIVE_ROOT']] + raw[0] + [raw[1]]), raw[2])) ##!!! normalize the image format... - res[GID] = { + img = { 'gid': GID, 'name': name, 'imported': time.time(), + 'updated': time.time(), # NOTE: this might get distorted on archiving or # copying... # mostly intended for importing... @@ -262,8 +282,30 @@ def gid_index(index, existing=None): 'TIFF': [e for e in l if e[2] == TIFF], 'other': [e for e in l if e[2] != OR(TIFF, PSD, JPEG, XMP, RAW)], } + # add new data... + if GID not in res: + res[GID] = img + new_n += 1 + # update existing... + else: + cur = res[GID] + updating = False + for k, v in img.iteritems(): + # skip + if k in ('imported', 'name', 'gid', 'ctime', 'updated'): + continue + if v != cur[k]: + cur[k] = v + updating = True + # do the actual update... + if updating: + cur['updated'] = time.time() + res[GID] = cur + up_n += 1 + else: + skipped += [GID] - return res, failed + return res, failed, skipped @@ -282,7 +324,7 @@ if __name__ == '__main__': lst = list(list_files(config['ARCHIVE_ROOT'])) print 'found files:', len(lst) - pprint(lst[0]) +## pprint(lst[0]) json.dump(lst, file(FILE_LIST, 'w')) print 'saved...' @@ -315,9 +357,26 @@ if __name__ == '__main__': ## GID_index = store.IndexWithCache(INDEX_PATH) GID_index = store.Index(INDEX_PATH) - ##!!! only check for updates... + # a cheating waw to say if we are empty... + index_empty = True + for k in GID_index.iterkeys(): + index_empty = False + break - GID_index, failed = gid_index(index, GID_index) + t0 = time.time() + + if not index_empty: + print 'updating...' + ##!!! this takes a substantially longer time initially... (about 30x longer) + GID_index, failed, skipped = gid_index(index, GID_index) + else: + print 'indexing...' + GID_index, failed, skipped = gid_index(index) + store.dump(GID_index, INDEX_PATH, index_depth=2) + + t1 = time.time() + + print 'done in:', t1-t0, 'seconds.' json.dump(failed, file(os.path.join('test', 'failed-to-categorise.json'), 'w')) @@ -336,13 +395,15 @@ if __name__ == '__main__': indexed: %s raws: %s failed: %s + skipped: %s ''' % ( len(GID_index), len([ e for e in lst if e[2] == RAW]), - len(failed)) + len(failed), + len(skipped)) - ##!!! this is really slow because it pulls ALL the data... wonder who wrote this? :) - pprint(GID_index.itervalues().next()) +## ##!!! this is really slow because it pulls ALL the data... wonder who wrote this? :) +## pprint(GID_index.itervalues().next()) ## store.dump(GID_index, INDEX_PATH) diff --git a/locate.py b/locate.py index 5ddd20db..19400aa4 100755 --- a/locate.py +++ b/locate.py @@ -1,7 +1,7 @@ #======================================================================= __version__ = '''0.0.01''' -__sub_version__ = '''20130322155314''' +__sub_version__ = '''20130325114759''' __copyright__ = '''(c) Alex A. Naanou 2011''' @@ -20,9 +20,6 @@ import store CONFIG_NAME = 'P7000_config.json' -#----------------------------------------------------------------------- - - #----------------------------------------------------------------------- if __name__ == '__main__': from optparse import OptionParser diff --git a/store.py b/store.py index be871a76..c06fc0ca 100755 --- a/store.py +++ b/store.py @@ -1,7 +1,7 @@ #======================================================================= __version__ = '''0.0.01''' -__sub_version__ = '''20130319150549''' +__sub_version__ = '''20130325170937''' __copyright__ = '''(c) Alex A. Naanou 2011''' @@ -10,6 +10,7 @@ __copyright__ = '''(c) Alex A. Naanou 2011''' import os import json import zipfile +import time import pli.pattern.mixin.mapping as mapping import pli.objutils as objutils @@ -67,6 +68,7 @@ def dump(index, path, index_depth=1, ext='.json'): #----------------------------------------------------------------load--- +##!!! make an iterator version... def load(path, ext='.json', pack_ext='.pack'): ''' load data from fs store. @@ -99,14 +101,23 @@ def load(path, ext='.json', pack_ext='.pack'): # only the last is accesible but this might cause trouble elsewhere... # NOTE: this should be done in the background (possible race-condition # with removing a file while it is being read) -def pack(path, ext='.json', pack_ext='.pack', keep_files=False, keep_dirs=False): +def pack(path, pack_name='%(timestamp)s', ext='.json', pack_ext='.pack', + keep_files=False, keep_dirs=False, date_format='%Y%m%d-%H%M%S'): ''' pack an fs data store. + Supported fields in pack_name: + %(timestamp)s - time stamp in the date_format format + NOTE: if keep_files is True, keep_dirs option will be ignored. + NOTE: if pack_name is static and a pack file with that name exists + then the files will be added to that pack. ''' + data = { + 'timestamp': time.strftime(date_format), + } ##!!! this will not remove original entries if they exist... - z = zipfile.ZipFile(os.path.join(path, 'index' + pack_ext), 'a', compression=zipfile.ZIP_DEFLATED) + z = zipfile.ZipFile(os.path.join(path, (pack_name % data) + pack_ext), 'a', compression=zipfile.ZIP_DEFLATED) for p, _, files in os.walk(path): for f in files: if f.endswith(ext): @@ -125,6 +136,24 @@ def pack(path, ext='.json', pack_ext='.pack', keep_files=False, keep_dirs=False) z.close() +#-----------------------------------------------------------cleanpack--- +def cleanpack(path, pack_name='%(timestamp)s', ext='.json', pack_ext='.pack', + keep_files=False, keep_dirs=False, date_format='%Y%m%d-%H%M%S'): + ''' + make a clean pack, removing duplicate enteries. + ''' + data = { + 'timestamp': time.strftime(date_format), + } + name = os.path.join(path, (pack_name % data) + pack_ext) + ##!!! this will load the whole monster to memory, need something better... + index = load(path) + z = zipfile.ZipFile(name, 'w', compression=zipfile.ZIP_DEFLATED) + for k, v in index.iteritems(): + z.writestr(k + ext, json.dumps(v, indent=4, separators=(', ', ': '))) + z.close() + + #----------------------------------------------------------------------- # lazy dict-like objects that read and write (optional) the fs... @@ -216,6 +245,16 @@ class Index(mapping.Mapping): yield os.path.splitext(name)[0] +#-------------------------------------------------------IndexWtihPack--- +class IndexWtihPack(object): + ''' + ''' + def pack(self): + ''' + pack the index. + ''' + pack(self._path) + #----------------------------------------------------------------------- REMOVED = object() @@ -286,6 +325,22 @@ class IndexWithCache(Index): del self._cache +#---------------------------------------------------IndexWithSubIndex--- +##class IndexWithSubIndex(Index): +## ''' +## ''' +## def indexby(self, attr): +## ''' +## ''' +## self._sub_indexs +## for e in self: +## pass +## def getby(self, attr, value): +## ''' +## ''' +## pass + + #======================================================================= # vim:set ts=4 sw=4 nowrap :