From 0816077356f8c22bf5b10f7ff7412691b3b016a5 Mon Sep 17 00:00:00 2001
From: "Alex A. Naanou" <alex.nanou@gmail.com>
Date: Thu, 10 Nov 2011 18:43:46 +0400
Subject: [PATCH] got the grouping mostly working. corner cases still fail (run
 index.py to see the ungrouped files)

Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com>
---
 index.py | 221 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 198 insertions(+), 23 deletions(-)
diff --git a/index.py b/index.py
index 0b8c7f3e..2e6daf34 100755
--- a/index.py
+++ b/index.py
@@ -1,7 +1,7 @@
 #=======================================================================
 
 __version__ = '''0.0.01'''
-__sub_version__ = '''20111103010916'''
+__sub_version__ = '''20111110184147'''
 __copyright__ = '''(c) Alex A. Naanou 2011'''
 
 
@@ -9,28 +9,55 @@ __copyright__ = '''(c) Alex A. Naanou 2011'''
 
 import os
 import json
+import uuid
 
-from pli.logictypes import OR
+from itertools import izip, izip_longest
+
+from pli.logictypes import ANY, OR
 
 
 #-----------------------------------------------------------------------
 
-CONFIG_NAME = 'config.json'
+CONFIG_NAME = 'test_config.json'
 
 config = json.load(open(CONFIG_NAME))
 
-ITEM_EXTENSIONS = (
-	# RAW formats...
-	'NEF', 'nef',
-	# JPEGs...
-	'JPG', 'JPEG', 'jpg', 'jpeg',
-	# Editid images...
-	'PSD', 'psd',
-	'TIFF', 'tiff', 'TIF', 'tif',
-	# metadata sidecar files...
-	'XMP', 'xmp',
+RAW = OR(
+	'NEF', 'nef', 
+	'CRW', 'crw',
+	'CR2', 'cr2',
+	'X3F', 'x3f'
 )
 
+JPEG = OR(
+	'JPG', 'jpg', 
+	'JPEG', 'jpeg'
+)
+
+PSD = OR(
+	'PSD', 'psd'
+)
+
+TIFF = OR(
+	'TIFF', 'tiff', 
+	'TIF', 'tif'
+)
+
+XMP = OR(
+	'XMP', 'xmp'
+)
+
+ITEM = OR(RAW, JPEG, PSD, TIFF, XMP)
+
+TYPES = {
+	'raw': RAW,
+	'jpeg': JPEG,
+	'psd': PSD,
+	'tiff': TIFF,
+	'xmp': XMP,
+}
+
+
 SUBTREE_CLASSES = {
 	'preview': 'preview', 
 	'preview (RAW)': 'RAW preview', 
@@ -39,22 +66,161 @@ SUBTREE_CLASSES = {
 
 #-----------------------------------------------------------------------
 
-def list_files(root, sub_trees=SUBTREE_CLASSES, ext=OR(*ITEM_EXTENSIONS)):
+##!!! we will need to normalize the paths to one single scheme (either relative or absolute)...
+def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM):
 	'''
 	'''
 	for path, dirs, files in os.walk(root):
-		# clasify by subtree...
-		p = os.path.split(path)
-		subtree_type = None 
-		for t in sub_trees:
-			if t in p:
-				subtree_type = sub_trees[t]
-				break
+		path = path.split(os.path.sep)
 		# process files...
 		for f in files:
+			name, ext = os.path.splitext(f)
+			# we need the extension without the dot...
+			ext = ext[1:]
 			# filter by ext...
-			if f.split('.')[-1] == ext:
-				yield subtree_type, path, f
+			if ext == type:
+				yield path, name, ext
+
+
+# XXX need to split duplicate named raw files and corresponding
+# 	  previews...
+def index_by_name(file_list, types=TYPES.items()):
+	'''Group (path, name, ext) entries by name and then by type key.
+	result format (<ext> is the type key taken from types):
+		{
+			<name>: {
+				<ext>: [
+					<path>,
+					...
+				],
+				...
+			},
+			...
+		}
+	'''
+	res = {}
+	for path, name, ext in file_list:
+		# normalize: find the (key, pattern) pair matching ext, keep the key...
+		ext = types[types.index((ANY, ext))][0]
+		if name not in res:
+			# first file seen with this name...
+			res[name] = {}
+		if ext not in res[name]:
+			# first file of this type for this name...
+			res[name][ext] = []
+		# general case: only the path is stored...
+##		res[name][ext] += [(path, name, ext)]
+		res[name][ext] += [path]
+	return res
+
+
+# for this to work correctly it must:
+# 	- return unique paths
+# 	- none of the returned paths can be a strict subset of any other...
+##!!!
+def split_common(paths):
+	'''Cut each path (a list of elements) after its first element that no
+	other path has at the same depth; returns one list per input path.'''
+	# pass 1: build list of common paths (None for all differences)
+	# NOTE: we may have stray common path elements but we do
+	# 		not care about anything after a None...
+	index = izip_longest(*paths)
+	common = []
+	for s in index:
+		next = []
+		for i in s:
+			if s.count(i) > 1:
+				next += [i]
+			else:
+				next += [None]
+		common += [next]
+	# pass 2: cap each common section with a unique element...
+	common = [ list(e) for e in izip(*common)]
+	for c, p in izip(common, paths):
+		if None in c:
+			i = c.index(None)
+			if len(p) <= i:
+				# NOTE: this is the case when we have a None
+				# 		because a path just ended... i.e. there
+				# 		was no different element to split at...
+				# XXX do we need to break here?
+				# XXX one way to go here is to simply ignore
+				# 	  such paths...
+				##!!! XXX we will leave a None at the end of such paths for now...
+##				del c[i]
+				continue
+			# in-place update and truncate the common path...
+			c[i] = p[i]
+			del c[i+1:]
+	return common
+
+# in essence this needs to replace the image name with a GID and split up
+# images that are identically named into separate GIDs...
+def split_images(index):
+	'''Yield (gid, data) pairs from a name index; a name with several raw
+	files is split into one record per raw file, matched by path prefix.'''
+	for name, data in index.items():
+		# this will not let us lose the name of the image...
+		data['name'] = name
+		raw = data['raw']
+		if len(raw) > 1:
+			# split the images...
+			# split images via closeness to one of the raw files...
+			# XXX the simple way to split files is to remove the
+			# 	  common part of the path between two raw files and
+			# 	  then split the other files by root of the
+			# 	  subtree.
+			# 	  this will not work in one case:
+			# 	  	- at least two of the raw files are in a deeper
+			# 	  	  subtree than the other accompanying files.
+			# 	  	  in this case we cannot use the topology to
+			# 	  	  decide which is which and need either to use
+			# 	  	  some other means or to go inside the image...
+			#
+			# way to do this:
+			# 	- build a subtree map -- list of paths until the
+			# 	  first unique directory
+			# 	- split files by subtree path
+			# 	- use a different strategy for files that are above
+			# 	  the subtrees...
+
+			common = split_common(raw)
+
+			# prepare the return structure (one record per raw file)...
+			res = []
+			for path in raw:
+				##!!!
+				res += [{
+					'gid': uuid.uuid4(),
+					'name': name,
+##					'raw': [path],
+				}]
+			# start splitting the data...
+			for ext, paths in data.items():
+				if ext not in TYPES:
+					continue
+				for path in paths:
+					matches = {}
+					for i, c in enumerate(common):
+						if path[:len(c)] == c:
+							matches[i] = len(c)
+					if len(matches) == 1:
+						i = matches.keys()[0]
+						# we found a location...
+						if ext not in res[i]:
+							res[i][ext] = []
+						res[i][ext] += [path]
+					elif len(matches) > 1:
+						raise Exception, 'got %s matches.' % len(matches)
+					else:
+						# XXX ungrouped...
+						print '!!!!', path, name, ext
+
+			# yield each split record with its own gid (not the whole list)...
+			for e in res:
+				yield e['gid'], e
+		else:
+			yield uuid.uuid4(), data
 
 
 
@@ -64,6 +230,15 @@ if __name__ == '__main__':
 
 	print len(lst)
 
+	index = index_by_name(list_files(config['ARCHIVE_ROOT']))
+
+	print len(index)
+
+	json.dump(index, file(os.path.join('test', 'filelist.json'), 'w'))
+
+	index = list(split_images(index_by_name(list_files(config['ARCHIVE_ROOT']))))
+
+	print len(index)