got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files)

Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com>
2025-10-28 18:00:09 +00:00 · 2011-11-10 18:43:46 +04:00 · 2011-11-10 18:43:46 +04:00 · 0816077356
commit 0816077356
parent ebd2865fed
1 changed files with 198 additions and 23 deletions
--- a/index.py
+++ b/index.py
@ -1,7 +1,7 @@
 #=======================================================================

 __version__ = '''0.0.01'''
-__sub_version__ = '''20111103010916'''
+__sub_version__ = '''20111110184147'''
 __copyright__ = '''(c) Alex A. Naanou 2011'''


@ -9,28 +9,55 @@ __copyright__ = '''(c) Alex A. Naanou 2011'''

 import os
 import json
+import uuid

-from pli.logictypes import OR
+from itertools import izip, izip_longest
+
+from pli.logictypes import ANY, OR


 #-----------------------------------------------------------------------

-CONFIG_NAME = 'config.json'
+CONFIG_NAME = 'test_config.json'

 config = json.load(open(CONFIG_NAME))

-ITEM_EXTENSIONS = (
-	# RAW formats...
-	'NEF', 'nef',
-	# JPEGs...
-	'JPG', 'JPEG', 'jpg', 'jpeg',
-	# Editid images...
-	'PSD', 'psd',
-	'TIFF', 'tiff', 'TIF', 'tif',
-	# metadata sidecar files...
-	'XMP', 'xmp',
+# NOTE: each of the following is a pli.logictypes.OR pattern, these are 
+# 		compared to extension strings via == (see: list_files(..)).
+# 		presumably an OR compares equal to any of its arguments
+# 		-- TODO confirm against pli.logictypes...
+
+# RAW formats...
+RAW = OR(
+	'NEF', 'nef', 
+	'CRW', 'crw',
+	'CR2', 'cr2',
+	'X3F', 'x3f'
 )

+# JPEGs...
+JPEG = OR(
+	'JPG', 'jpg', 
+	'JPEG', 'jpeg'
+)
+
+# edited images...
+PSD = OR(
+	'PSD', 'psd'
+)
+
+TIFF = OR(
+	'TIFF', 'tiff', 
+	'TIF', 'tif'
+)
+
+# metadata sidecar files...
+XMP = OR(
+	'XMP', 'xmp'
+)
+
+# any of the above, this is the default file filter in list_files(..)...
+ITEM = OR(RAW, JPEG, PSD, TIFF, XMP)
+
+# type name -> extension pattern, this is used by index_by_name(..) to
+# normalize file extensions to type names...
+TYPES = {
+	'raw': RAW,
+	'jpeg': JPEG,
+	'psd': PSD,
+	'tiff': TIFF,
+	'xmp': XMP,
+}
+
+
 SUBTREE_CLASSES = {
 	'preview': 'preview', 
 	'preview (RAW)': 'RAW preview', 
@ -39,22 +66,161 @@ SUBTREE_CLASSES = {

 #-----------------------------------------------------------------------

-def list_files(root, sub_trees=SUBTREE_CLASSES, ext=OR(*ITEM_EXTENSIONS)):
+##!!! we will need to normalize the paths to one single scheme (either relative or absolute)...
+def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM):
 	'''
+	walk the tree at root and yield a (<path>, <name>, <ext>) tuple for
+	each file whose extension compares equal to type.
+
+	<path> is the containing directory split into a list of components
+	(on os.path.sep), <name> is the file name without the extension 
+	and <ext> is the extension without the leading dot.
+
+	NOTE: sub_trees is accepted but not used by this version.
 	'''
 	for path, dirs, files in os.walk(root):
-		# clasify by subtree...
-		p = os.path.split(path)
-		subtree_type = None 
-		for t in sub_trees:
-			if t in p:
-				subtree_type = sub_trees[t]
-				break
+		# split the directory into a list of path elements...
+		path = path.split(os.path.sep)
 		# process files...
 		for f in files:
+			name, ext = os.path.splitext(f)
+			# we need the extension without the dot...
+			ext = ext[1:]
 			# filter by ext...
-			if f.split('.')[-1] == ext:
-				yield subtree_type, path, f
+			# NOTE: type is by default an OR pattern (ITEM), presumably
+			# 		this is equal to any of the extensions it was built 
+			# 		from -- TODO confirm against pli.logictypes...
+			if ext == type:
+				yield path, name, ext
+
+
+# XXX need to split duplicate named raw files and corresponding
+# 	  previews...
+def index_by_name(file_list, types=TYPES.items()):
+	'''
+	group the (<path>, <name>, <ext>) tuples of file_list by name and 
+	then by file type.
+
+	types is a list of (<type-name>, <pattern>) pairs used to map a 
+	file extension to its type name.
+
+	format:
+		{
+			<name>: {
+				<ext>: [
+					<path>,
+					...
+				],
+				...
+			},
+			...
+		}
+
+	NOTE: <ext> in the result is the normalized type name (the first
+		  element of the matching pair in types, e.g. 'raw') and not the
+		  literal file extension.
+	NOTE: the default for types is computed once, when the function is
+		  defined.
+	'''
+	res = {}
+	for path, name, ext in file_list:
+		# normalize extension...
+		# NOTE: this finds the first pair in types that is equal to 
+		# 		(ANY, ext) and takes its type name, presumably ANY 
+		# 		matches any name and the pattern matches ext 
+		# 		-- TODO confirm against pli.logictypes...
+		# NOTE: list.index(..) will raise a ValueError if nothing 
+		# 		matches...
+		ext = types[types.index((ANY, ext))][0]
+		if name not in res:
+			# create a name...
+			res[name] = {}
+		if ext not in res[name]:
+			# create an extension...
+			res[name][ext] = []
+		# general case...
+##		res[name][ext] += [(path, name, ext)]
+		res[name][ext] += [path]
+	return res
+
+
+# for this to work correctly it must:
+# 	- return unique paths
+# 	- none of the returned paths can be a strict subset of any other...
+##!!!
+def split_common(paths):
+	'''
+	build a list of path prefixes, one per path in paths (same order).
+
+	each path is expected to be a sequence of path elements. the 
+	resulting prefix is the leading elements of the path that also 
+	occur at the same position in at least one other path, capped with
+	the first element of the path that does not.
+
+	NOTE: if a path ends before such an element is found, its prefix 
+		  is left with None in place of the missing elements.
+	'''
+	# pass 1: build list of common paths (None for all differences)
+	# NOTE: we may have stray common path elements but we do
+	# 		not care about anything after a None...
+	# NOTE: this iterates the paths position by position, shorter paths
+	# 		are padded with None...
+	index = izip_longest(*paths)
+	common = []
+	for s in index:
+		next = []
+		for i in s:
+			# keep the element only if it occurs more than once at this
+			# position...
+			if s.count(i) > 1:
+				next += [i]
+			else:
+				next += [None]
+		common += [next]
+	# pass 2: cap each common section with a unique element...
+	# NOTE: this transposes common back to one list per path...
+	common = [ list(e) for e in izip(*common)]
+	for c, p in izip(common, paths):
+		if None in c:
+			i = c.index(None)
+			if len(p) <= i:
+				# NOTE: this is the case when we have a None 
+				# 		because a path just ended... i.e. there 
+				# 		was no different element to split at...
+				# XXX do we need to break here?
+				# XXX one way to go here is to simply ignore
+				# 	  such paths...
+				##!!! XXX we will leave a None at the end of such paths for now...
+##				del c[i]
+				continue
+			# in-place update and truncate the common path...
+			c[i] = p[i]
+			del c[i+1:]
+	return common
+
+# in essence this needs to replace the image name with a GID and split up
+# images that are identically named into separate GIDs...
+def split_images(index):
+	'''
+	split an index (in the format built by index_by_name) into 
+	per-image records.
+
+	yields (<gid>, <data>) pairs where <gid> is a new uuid.uuid4() and
+	<data> is a dict:
+		- for names with at most one raw file this is the index entry
+		  itself, with a 'name' key added.
+		- for names with several raw files one record is yielded per 
+		  raw file, containing 'gid', 'name' and the paths that fall 
+		  into the subtree of that raw file, grouped by type.
+
+	raises an Exception if a path matches more than one raw subtree.
+
+	NOTE: this adds a 'name' key to each entry of index in-place.
+	NOTE: paths that match none of the raw subtrees are printed and 
+		  left ungrouped.
+	'''
+	for name, data in index.items():
+		# this will not let us lose the name of the image...
+		data['name'] = name
+		# NOTE: an image may have no raw files at all (e.g. only a 
+		# 		JPEG), in this case there is nothing to split...
+		raw = data.get('raw', [])
+		if len(raw) > 1:
+			# split the images...
+			# split images via closeness to one of the raw files...
+			# XXX the simple way to split files is to remove the
+			# 	  common part of the path between two raw files and
+			# 	  then split the other files by root of the
+			# 	  subtree.
+			# 	  this will not work in one case:
+			# 	  	- at least two of the raw files are in a deeper
+			# 	  	  subtree than the other accompanying files.
+			# 	  	  in this case we can not use the topology to
+			# 	  	  decide which is which and need either to use
+			# 	  	  some other means or to go inside the image...
+			#
+			# way to do this:
+			# 	- build a subtree map -- list of paths until the
+			# 	  first unique directory
+			# 	- split files by subtree path
+			# 	- use a different strategy for files that are above
+			# 	  the subtrees...
+
+			# NOTE: common[i] is the subtree prefix for raw[i]...
+			common = split_common(raw)
+
+			# prepare the return structure...
+			# NOTE: res[i] is the record for raw[i]...
+			res = []
+			for path in raw:
+				##!!!
+				res += [{
+					'gid': uuid.uuid4(),
+					'name': name,
+##					'raw': [path],
+				}]
+			# start splitting the data...
+			for ext, paths in data.items():
+				# skip the non-type keys (i.e. 'name')...
+				if ext not in TYPES:
+					continue
+				for path in paths:
+					# collect the subtrees this path is in...
+					# XXX a prefix that split_common(..) left with a 
+					# 	  None in it will never match here...
+					matches = {}
+					for i, c in enumerate(common):
+						if path[:len(c)] == c:
+							matches[i] = len(c)
+					if len(matches) == 1:
+						i = matches.keys()[0]
+						# we found a location...
+						if ext not in res[i]:
+							res[i][ext] = []
+						res[i][ext] += [path]
+					elif len(matches) > 1:
+						raise Exception('got %s matches.' % len(matches))
+					else:
+						# XXX ungrouped...
+						print '!!!!', path, name, ext
+
+			# yield the results...
+			# NOTE: each group yields its own record and not the whole
+			# 		list of groups...
+			for e in res:
+				yield e['gid'], e
+		else:
+			yield uuid.uuid4(), data



@ -64,6 +230,15 @@ if __name__ == '__main__':

 	print len(lst)

+	index = index_by_name(list_files(config['ARCHIVE_ROOT']))
+
+	print len(index)
+
+	json.dump(index, file(os.path.join('test', 'filelist.json'), 'w'))
+
+	index = list(split_images(index_by_name(list_files(config['ARCHIVE_ROOT']))))
+
+	print len(index)