From 3d41a07c7a595929412e2a5bf9063486e34209c6 Mon Sep 17 00:00:00 2001
From: "Alex A. Naanou" <alex.nanou@gmail.com>
Date: Wed, 27 Mar 2013 18:13:52 +0400
Subject: [PATCH] lots of minor tweaks and changes...

Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com>
---
 gid.py    | 11 ++++---
 index2.py | 93 +++++++++++++++++++++++++++++++++++++++++++++----------
 locate.py |  5 +--
 store.py  | 61 ++++++++++++++++++++++++++++++++++--
 4 files changed, 143 insertions(+), 27 deletions(-)
diff --git a/gid.py b/gid.py
index 694201e9..b864f054 100755
--- a/gid.py
+++ b/gid.py
@@ -1,7 +1,7 @@
 #=======================================================================
 
 __version__ = '''0.0.01'''
-__sub_version__ = '''20130322142905'''
+__sub_version__ = '''20130325203750'''
 __copyright__ = '''(c) Alex A. Naanou 2011'''
 
 
@@ -52,7 +52,8 @@ def image_gid(path, date=None,
 	Supported fields:
 		%(artist)s	- Exif.Image.Artist field, stripped and spaces replaced
 					  with underscores.
-		%(date)s	- Exif.Image.DateTime formated to date_format argument.
+					  If no artist info is set this will be set to default_artist.
+		%(date)s	- Exif.Photo.DateTimeOriginal formatted to date_format argument.
 		%(name)s	- file name.
 
 	NOTE: date and time are the date and time the image was made ('Exif.Image.DateTime')
@@ -77,7 +78,6 @@ def image_gid(path, date=None,
 			date = os.path.getctime(path)
 			data['date'] = time.strftime(date_format, time.gmtime(date))
 		else:
-##			date = i['Exif.Image.DateTime'].value
 			date = i['Exif.Photo.DateTimeOriginal'].value
 			data['date'] = date.strftime(date_format)
 	# check if we need an artist...
@@ -85,7 +85,10 @@ def image_gid(path, date=None,
 		data['artist'] = default_artist
 		if i is not None:
 			try:
-				data['artist'] = i['Exif.Image.Artist'].value.strip().replace(' ', '_')
+				# set the artist if in EXIF...
+				a = i['Exif.Image.Artist'].value.strip().replace(' ', '_')
+				if a != '':
+					data['artist'] = a
 			except KeyError:
 				pass
 	
diff --git a/index2.py b/index2.py
index 01afbc2e..27ba89ce 100755
--- a/index2.py
+++ b/index2.py
@@ -1,7 +1,7 @@
 #=======================================================================
 
 __version__ = '''0.0.01'''
-__sub_version__ = '''20130319151025'''
+__sub_version__ = '''20130326030151'''
 __copyright__ = '''(c) Alex A. Naanou 2011'''
 
 
@@ -83,6 +83,9 @@ TYPES = {
 	'xmp': XMP,
 }
 
+SKIP_DIRS = '.sys2'
+SKIP_MARKER = '.skipindexing'
+
 
 SUBTREE_CLASSES = {
 	'preview': 'preview', 
@@ -97,12 +100,22 @@ SUBTREE_CLASSES = {
 #----------------------------------------------------------list_files---
 ##!!! we will need to normalize the paths to one single scheme (either relative or absolute)...
 # XXX might need to fetch file data too...
-def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM, include_root_path=False, include_ctime=True):
+def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM, 
+		include_root_path=False, include_ctime=True, 
+		skip_marker=SKIP_MARKER, skip_dirs=SKIP_DIRS):
 	'''
 	yields:
 		(<path>, <name>, <ext>[, <ctime>]),
 	'''
 	for orig_path, dirs, files in os.walk(root):
+		# skip dir trees containing the skip_marker file...
+		if skip_marker in files:
+			del dirs[:]
+			continue
+		# skip dirs...
+		while skip_dirs in dirs:
+			dirs.remove(skip_dirs)
+
 		# XXX is this correct...
 		path = orig_path.split(os.path.sep)
 		# remove root from path...
@@ -125,6 +138,7 @@ def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM, include_root_path=Fal
 #----------------------------------------------------------common_len---
 def common_len(a, *b):
 	'''
+	calculate the common path length.
 	'''
 	for i, l in enumerate(izip(*(a,) + b)):
 		if len(set(l)) != 1:
@@ -174,13 +188,12 @@ def split_by_raws(raws, lst, failed):
 	'''
 	'''
 ##	raws = [e for e in lst if e[2] == RAW] 
+	# top level common path...
 	common = common_len(*[ e[0] for e in raws ])
 
 	# NOTE: do not change the order of raws after this point
 	# 		and till the end of the loop...
 	# 		XXX revise if there is a simpler way...
-	##!!! this kills code like sets[0][1] += [...]
-##	sets = [ (r, [r]) for r in raws ]
 	sets = [ [r, [r]] for r in raws ]
 
 	for e in lst:
@@ -199,7 +212,6 @@ def split_by_raws(raws, lst, failed):
 			failed += [e]
 		# found a location...
 		elif c > common:
-			##!!! for some odd reason this does not work....
 			sets[i][1] += [e]
 		# file in an odd location ##!!! list these locations...
 		else:
@@ -207,14 +219,15 @@ def split_by_raws(raws, lst, failed):
 			##!!! try different strategies here...
 			##!!!
 			failed += [e]
-##	return sets, failed
 	return sets
 
 
 #-----------------------------------------------------------gid_index---
+##!!! this will rewrite existing data -- should only update...
 def gid_index(index, existing=None):
 	'''
 	'''
+	skipped = []
 	# index via a propper GID...
 	# split similarly named but different files...
 	if existing is None:
@@ -222,34 +235,41 @@ def gid_index(index, existing=None):
 	else:
 		res = existing
 	failed = []
+	im_n = 0
+	up_n = 0
+	new_n = 0
+
 	for name, l in index.iteritems():
 		l.sort()
 		raws = [e for e in l if e[2] == RAW] 
 
 		# multiple raw files...
 		if len(raws) > 1:
-			# spit this into a seporate func...
 			sets = split_by_raws(raws, l, failed)
 		# single raw...
 		elif len(raws) == 1:
 			sets = [(raws[0], l)]
 		# no raw files...
 		else:
-			print 'no raw file found for "%s"...' % os.path.join(name)
+			print (' '*78), '\rno raw file found for "%s"...' % os.path.join(name)
 			sets = []
 			##!!! need to report this in a usable way...
 			failed += l
 
 		# add actual elements to index...
 		for raw, l in sets:
+			im_n += 1
+			print 'Processing image:', im_n, 'new:', new_n, 'updated:', up_n, '\r',
+
 			# get file GID...
 			GID = image_gid('%s.%s' % (os.path.join(*[config['ARCHIVE_ROOT']] + raw[0] + [raw[1]]), raw[2]))
 
 			##!!! normalize the image format...
-			res[GID] = {
+			img = {
 				'gid': GID,
 				'name': name,
 				'imported': time.time(),
+				'updated': time.time(),
 				# NOTE: this might get distorted on archiving or
 				# 		copying...
 				# 		mostly intended for importing...
@@ -262,8 +282,30 @@ def gid_index(index, existing=None):
 				'TIFF': [e for e in l if e[2] == TIFF],
 				'other': [e for e in l if e[2] != OR(TIFF, PSD, JPEG, XMP, RAW)],
 			}
+			# add new data...
+			if GID not in res:
+				res[GID] = img
+				new_n += 1
+			# update existing...
+			else:
+				cur = res[GID]
+				updating = False
+				for k, v in img.iteritems():
+					# skip identity and timestamp keys -- they never count as a change...
+					if k in ('imported', 'name', 'gid', 'ctime', 'updated'):
+						continue
+					if v != cur[k]:
+						cur[k] = v
+						updating = True
+				# do the actual update...
+				if updating:
+					cur['updated'] = time.time()
+					res[GID] = cur
+					up_n += 1
+				else:
+					skipped += [GID]
 
-	return res, failed
+	return res, failed, skipped
 
 
 
@@ -282,7 +324,7 @@ if __name__ == '__main__':
 		lst = list(list_files(config['ARCHIVE_ROOT']))
 	
 		print 'found files:', len(lst)
-		pprint(lst[0])
+##		pprint(lst[0])
 	
 		json.dump(lst, file(FILE_LIST, 'w'))
 		print 'saved...'
@@ -315,9 +357,26 @@ if __name__ == '__main__':
 ##	GID_index = store.IndexWithCache(INDEX_PATH)
 	GID_index = store.Index(INDEX_PATH)
 
-	##!!! only check for updates...
+	# a cheating way to say if we are empty...
+	index_empty = True
+	for k in GID_index.iterkeys():
+		index_empty = False
+		break
 
-	GID_index, failed = gid_index(index, GID_index)
+	t0 = time.time()
+
+	if not index_empty:
+		print 'updating...'
+		##!!! this takes a substantially longer time initially... (about 30x longer)
+		GID_index, failed, skipped = gid_index(index, GID_index)
+	else:
+		print 'indexing...'
+		GID_index, failed, skipped = gid_index(index)
+		store.dump(GID_index, INDEX_PATH, index_depth=2)
+
+	t1 = time.time()
+
+	print 'done in:', t1-t0, 'seconds.'
 
 	json.dump(failed, file(os.path.join('test', 'failed-to-categorise.json'), 'w'))
 
@@ -336,13 +395,15 @@ if __name__ == '__main__':
 	indexed: %s
 	raws: %s
 	failed: %s
+	skipped: %s
 	''' % (
 			len(GID_index), 
 			len([ e for e in lst if e[2] == RAW]), 
-			len(failed))
+			len(failed),
+			len(skipped))
 
-	##!!! this is really slow because it pulls ALL the data... wonder who wrote this? :)
-	pprint(GID_index.itervalues().next())
+##	##!!! this is really slow because it pulls ALL the data... wonder who wrote this? :)
+##	pprint(GID_index.itervalues().next())
 
 ##	store.dump(GID_index, INDEX_PATH)
 
diff --git a/locate.py b/locate.py
index 5ddd20db..19400aa4 100755
--- a/locate.py
+++ b/locate.py
@@ -1,7 +1,7 @@
 #=======================================================================
 
 __version__ = '''0.0.01'''
-__sub_version__ = '''20130322155314'''
+__sub_version__ = '''20130325114759'''
 __copyright__ = '''(c) Alex A. Naanou 2011'''
 
 
@@ -20,9 +20,6 @@ import store
 CONFIG_NAME = 'P7000_config.json'
 
 
-#-----------------------------------------------------------------------
-
-
 #-----------------------------------------------------------------------
 if __name__ == '__main__':
 	from optparse import OptionParser
diff --git a/store.py b/store.py
index be871a76..c06fc0ca 100755
--- a/store.py
+++ b/store.py
@@ -1,7 +1,7 @@
 #=======================================================================
 
 __version__ = '''0.0.01'''
-__sub_version__ = '''20130319150549'''
+__sub_version__ = '''20130325170937'''
 __copyright__ = '''(c) Alex A. Naanou 2011'''
 
 
@@ -10,6 +10,7 @@ __copyright__ = '''(c) Alex A. Naanou 2011'''
 import os
 import json
 import zipfile
+import time
 
 import pli.pattern.mixin.mapping as mapping
 import pli.objutils as objutils
@@ -67,6 +68,7 @@ def dump(index, path, index_depth=1, ext='.json'):
 
 
 #----------------------------------------------------------------load---
+##!!! make an iterator version...
 def load(path, ext='.json', pack_ext='.pack'):
 	'''
 	load data from fs store.
@@ -99,14 +101,23 @@ def load(path, ext='.json', pack_ext='.pack'):
 # 	  only the last is accesible but this might cause trouble elsewhere...
 # NOTE: this should be done in the background (possible race-condition
 # 		with removing a file while it is being read)
-def pack(path, ext='.json', pack_ext='.pack', keep_files=False, keep_dirs=False):
+def pack(path, pack_name='%(timestamp)s', ext='.json', pack_ext='.pack', 
+		keep_files=False, keep_dirs=False, date_format='%Y%m%d-%H%M%S'):
 	'''
 	pack an fs data store.
 
+	Supported fields in pack_name:
+		%(timestamp)s		- time stamp in the date_format format
+
 	NOTE: if keep_files is True, keep_dirs option will be ignored.
+	NOTE: if pack_name is static and a pack file with that name exists 
+			then the files will be added to that pack.
 	'''
+	data = {
+		'timestamp': time.strftime(date_format),
+	}
 	##!!! this will not remove original entries if they exist...
-	z = zipfile.ZipFile(os.path.join(path, 'index' + pack_ext), 'a', compression=zipfile.ZIP_DEFLATED)
+	z = zipfile.ZipFile(os.path.join(path, (pack_name % data) + pack_ext), 'a', compression=zipfile.ZIP_DEFLATED)
 	for p, _, files in os.walk(path):
 		for f in files: 
 			if f.endswith(ext):
@@ -125,6 +136,24 @@ def pack(path, ext='.json', pack_ext='.pack', keep_files=False, keep_dirs=False)
 	z.close()
 	
 
+#-----------------------------------------------------------cleanpack---
+def cleanpack(path, pack_name='%(timestamp)s', ext='.json', pack_ext='.pack', 
+		keep_files=False, keep_dirs=False, date_format='%Y%m%d-%H%M%S'):
+	'''
+	make a clean pack, removing duplicate entries.
+	'''
+	data = {
+		'timestamp': time.strftime(date_format),
+	}
+	name = os.path.join(path, (pack_name % data) + pack_ext)
+	##!!! this will load the whole monster to memory, need something better...
+	index = load(path)
+	z = zipfile.ZipFile(name, 'w', compression=zipfile.ZIP_DEFLATED)
+	for k, v in index.iteritems():
+		z.writestr(k + ext, json.dumps(v, indent=4, separators=(', ', ': ')))
+	z.close()
+
+
 
 #-----------------------------------------------------------------------
 # lazy dict-like objects that read and write (optional) the fs...
@@ -216,6 +245,16 @@ class Index(mapping.Mapping):
 					yield os.path.splitext(name)[0]
 
 
+#-------------------------------------------------------IndexWtihPack---
+class IndexWtihPack(object):
+	'''
+	'''
+	def pack(self):
+		'''
+		pack the index.
+		'''
+		pack(self._path)
+		
 
 #-----------------------------------------------------------------------
 REMOVED = object()
@@ -286,6 +325,22 @@ class IndexWithCache(Index):
 		del self._cache
 
 
+#---------------------------------------------------IndexWithSubIndex---
+##class IndexWithSubIndex(Index):
+##	'''
+##	'''
+##	def indexby(self, attr):
+##		'''
+##		'''
+##		self._sub_indexs
+##		for e in self:
+##			pass
+##	def getby(self, attr, value):
+##		'''
+##		'''
+##		pass
+
+
 
 #=======================================================================
 #                                            vim:set ts=4 sw=4 nowrap :