ImageGrid/index.py

#=======================================================================

__version__ = '''0.0.01'''
__sub_version__ = '''20111110190116'''
__copyright__ = '''(c) Alex A. Naanou 2011'''


#-----------------------------------------------------------------------

import os
import json
import uuid

from itertools import izip, izip_longest

from pli.logictypes import ANY, OR


#-----------------------------------------------------------------------

CONFIG_NAME = 'test_config.json'

config = json.load(open(CONFIG_NAME))

RAW = OR(
	'NEF', 'nef', 
	'CRW', 'crw',
	'CR2', 'cr2',
	'X3F', 'x3f'
)

JPEG = OR(
	'JPG', 'jpg', 
	'JPEG', 'jpeg'
)

PSD = OR(
	'PSD', 'psd'
)

TIFF = OR(
	'TIFF', 'tiff', 
	'TIF', 'tif'
)

XMP = OR(
	'XMP', 'xmp'
)

ITEM = OR(RAW, JPEG, PSD, TIFF, XMP)

TYPES = {
	'raw': RAW,
	'jpeg': JPEG,
	'psd': PSD,
	'tiff': TIFF,
	'xmp': XMP,
}


SUBTREE_CLASSES = {
	'preview': 'preview', 
	'preview (RAW)': 'RAW preview', 
}


#-----------------------------------------------------------------------

##!!! we will need to normalize the paths to one single scheme (either relative or absolute)...
def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM):
	'''
	'''
	for path, dirs, files in os.walk(root):
		path = path.split(os.path.sep)
		# process files...
		for f in files:
			name, ext = os.path.splitext(f)
			# we need the extension wothout the dot...
			ext = ext[1:]
			# filter by ext...
			if ext == type:
				yield path, name, ext


# XXX need to split duplicate named raw files and corresponding
# 	  previews...
def index_by_name(file_list, types=TYPES.items()):
	'''
	format:
		{
			<name>: {
				<ext>: [
					<path>,
					...
				],
				...
			},
			...
		}
	'''
	res = {}
	for path, name, ext in file_list:
		# normalize extension...
		ext = types[types.index((ANY, ext))][0]
		if name not in res:
			# create a name...
			res[name] = {}
		if ext not in res[name]:
			# create an extension...
			res[name][ext] = []
		# general case...
##		res[name][ext] += [(path, name, ext)]
		res[name][ext] += [path]
	return res


# for this to work correctly it must:
# 	- return unique paths
# 	- non of the returnd paths can be a strict subset of any other...
##!!!
def split_common(paths):
	'''
	'''
	# pass 1: build list of common paths (None for all differences)
	# NOTE: we may have stray common path elements but we do
	# 		not care abut anything after a None...
	index = izip_longest(*paths)
	common = []
	for s in index:
		next = []
		for i in s:
			if s.count(i) > 1:
				next += [i]
			else:
				next += [None]
		common += [next]
	# pass 2: cap each common section with a unique element...
	common = [ list(e) for e in izip(*common)]
	for c, p in izip(common, paths):
		if None in c:
			i = c.index(None)
			if len(p) <= i:
				# NOTE: this is the case when we have a None 
				# 		because a path just ended... i.e. there 
				# 		was no different element to split at...
				# XXX do we need to break here?
				# XXX one way to go here is to simply ignore
				# 	  such paths...
				del c[i]
				continue
			# in-place update and truncate the common path...
			c[i] = p[i]
			del c[i+1:]
	return common

# in essance this need to replace image name with a GID and split up
# images that are identically named into seporate GIDs...
def split_images(index):
	'''
	'''
	for name, data in index.items():
		# this will not let us lose the name of the image...
		data['name'] = name
		raw = data['raw']
		if len(raw) > 1:
			# split the images...
			# split images via closeness to one of the raw files...
			# XXX the simple way to split files is to remove the
			# 	  common part of the path between two raw files and
			# 	  then split the other files by root of the
			# 	  subtree.
			# 	  this will not work in one case:
			# 	  	- at least two of the raw files are in a deeper
			# 	  	  subtree than the other accompanying files.
			# 	  	  in this case wa can not use the topology to
			# 	  	  decide which is wich and need either to use
			# 	  	  some other means or to go inside the image...
			#
			# way to do this:
			# 	- build a subtree map -- list of paths until the
			# 	  first unique directory
			# 	- split files by subtree path
			# 	- use a different strategy for files that are above
			# 	  the subtrees...

			common = split_common(raw)

			# prepare the return structure...
			res = []
			for path in raw:
				##!!!
				res += [{
					'gid': uuid.uuid4(),
					'name': name,
##					'raw': [path],
				}]
			# start splitting the data...
			for ext, paths in data.items():
				# skip non-type fields...
				if ext not in TYPES:
					continue
				for path in paths:
					matches = []
					for i, c in enumerate(common):
						if path[:len(c)] == c:
							matches += [(len(c), i)]
					# multiple matches...
					if len(matches) > 1:
						matches.sort(key=lambda e: e[0])
						if matches[0][0] == matches[1][0]:
							# XXX we could try and use a different
							# 	  strategy...
							##!!! do a better error...
							raise Exception, 'got two targets with same score, can\'t decide where to put the file.'
						del matches[1:]
					if len(matches) == 1:
						i = matches[0][1]
						# we found a location...
						if ext not in res[i]:
							res[i][ext] = []
						res[i][ext] += [path]
					else:
						# XXX ungrouped...
						print '!!!!', path, name, ext
						raise Exception, 'still got ungrouped files...'

			# yield the results...
			for e in res:
				yield e['gid'], res
		else:
			yield uuid.uuid4(), data


#-----------------------------------------------------------------------
if __name__ == '__main__':
	lst = list(list_files(config['ARCHIVE_ROOT']))

	print len(lst)

	index = index_by_name(list_files(config['ARCHIVE_ROOT']))

	print len(index)

	json.dump(index, file(os.path.join('test', 'filelist.json'), 'w'))

	index = list(split_images(index_by_name(list_files(config['ARCHIVE_ROOT']))))

	print len(index)


#=======================================================================
#                                            vim:set ts=4 sw=4 nowrap :
started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00			`#=======================================================================`

			`__version__ = '''0.0.01'''`
grouping almost done, still needs cleaningup and revising... (also think of better strategies to resolve and test corner cases) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 19:02:26 +04:00			`__sub_version__ = '''20111110190116'''`
started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00			`__copyright__ = '''(c) Alex A. Naanou 2011'''`


			`#-----------------------------------------------------------------------`

			`import os`
			`import json`
got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`import uuid`
started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00
got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`from itertools import izip, izip_longest`

			`from pli.logictypes import ANY, OR`
started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00

			`#-----------------------------------------------------------------------`

got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`CONFIG_NAME = 'test_config.json'`
started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00
			`config = json.load(open(CONFIG_NAME))`

got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`RAW = OR(`
			`'NEF', 'nef',`
			`'CRW', 'crw',`
			`'CR2', 'cr2',`
			`'X3F', 'x3f'`
			`)`

			`JPEG = OR(`
			`'JPG', 'jpg',`
			`'JPEG', 'jpeg'`
			`)`

			`PSD = OR(`
			`'PSD', 'psd'`
			`)`

			`TIFF = OR(`
			`'TIFF', 'tiff',`
			`'TIF', 'tif'`
started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00			`)`

got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`XMP = OR(`
			`'XMP', 'xmp'`
			`)`

			`ITEM = OR(RAW, JPEG, PSD, TIFF, XMP)`

			`TYPES = {`
			`'raw': RAW,`
			`'jpeg': JPEG,`
			`'psd': PSD,`
			`'tiff': TIFF,`
			`'xmp': XMP,`
			`}`


started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00			`SUBTREE_CLASSES = {`
			`'preview': 'preview',`
			`'preview (RAW)': 'RAW preview',`
			`}`


			`#-----------------------------------------------------------------------`

got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`##!!! we will need to normalize the paths to one single scheme (either relative or absolute)...`
			`def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM):`
started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00			`'''`
			`'''`
			`for path, dirs, files in os.walk(root):`
got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`path = path.split(os.path.sep)`
started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00			`# process files...`
			`for f in files:`
got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`name, ext = os.path.splitext(f)`
			`# we need the extension wothout the dot...`
			`ext = ext[1:]`
started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00			`# filter by ext...`
got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`if ext == type:`
			`yield path, name, ext`


			`# XXX need to split duplicate named raw files and corresponding`
			`# previews...`
			`def index_by_name(file_list, types=TYPES.items()):`
			`'''`
			`format:`
			`{`
			`<name>: {`
			`<ext>: [`
			`<path>,`
			`...`
			`],`
			`...`
			`},`
			`...`
			`}`
			`'''`
			`res = {}`
			`for path, name, ext in file_list:`
			`# normalize extension...`
			`ext = types[types.index((ANY, ext))][0]`
			`if name not in res:`
			`# create a name...`
			`res[name] = {}`
			`if ext not in res[name]:`
			`# create an extension...`
			`res[name][ext] = []`
			`# general case...`
			`## res[name][ext] += [(path, name, ext)]`
			`res[name][ext] += [path]`
			`return res`


			`# for this to work correctly it must:`
			`# - return unique paths`
			`# - non of the returnd paths can be a strict subset of any other...`
			`##!!!`
			`def split_common(paths):`
			`'''`
			`'''`
			`# pass 1: build list of common paths (None for all differences)`
			`# NOTE: we may have stray common path elements but we do`
			`# not care abut anything after a None...`
			`index = izip_longest(*paths)`
			`common = []`
			`for s in index:`
			`next = []`
			`for i in s:`
			`if s.count(i) > 1:`
			`next += [i]`
			`else:`
			`next += [None]`
			`common += [next]`
			`# pass 2: cap each common section with a unique element...`
			`common = [ list(e) for e in izip(*common)]`
			`for c, p in izip(common, paths):`
			`if None in c:`
			`i = c.index(None)`
			`if len(p) <= i:`
			`# NOTE: this is the case when we have a None`
			`# because a path just ended... i.e. there`
			`# was no different element to split at...`
			`# XXX do we need to break here?`
			`# XXX one way to go here is to simply ignore`
			`# such paths...`
grouping almost done, still needs cleaningup and revising... (also think of better strategies to resolve and test corner cases) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 19:02:26 +04:00			`del c[i]`
got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`continue`
			`# in-place update and truncate the common path...`
			`c[i] = p[i]`
			`del c[i+1:]`
			`return common`

			`# in essance this need to replace image name with a GID and split up`
			`# images that are identically named into seporate GIDs...`
			`def split_images(index):`
			`'''`
			`'''`
			`for name, data in index.items():`
			`# this will not let us lose the name of the image...`
			`data['name'] = name`
			`raw = data['raw']`
			`if len(raw) > 1:`
			`# split the images...`
			`# split images via closeness to one of the raw files...`
			`# XXX the simple way to split files is to remove the`
			`# common part of the path between two raw files and`
			`# then split the other files by root of the`
			`# subtree.`
			`# this will not work in one case:`
			`# - at least two of the raw files are in a deeper`
			`# subtree than the other accompanying files.`
			`# in this case wa can not use the topology to`
			`# decide which is wich and need either to use`
			`# some other means or to go inside the image...`
			`#`
			`# way to do this:`
			`# - build a subtree map -- list of paths until the`
			`# first unique directory`
			`# - split files by subtree path`
			`# - use a different strategy for files that are above`
			`# the subtrees...`

			`common = split_common(raw)`

			`# prepare the return structure...`
			`res = []`
			`for path in raw:`
			`##!!!`
			`res += [{`
			`'gid': uuid.uuid4(),`
			`'name': name,`
			`## 'raw': [path],`
			`}]`
			`# start splitting the data...`
			`for ext, paths in data.items():`
grouping almost done, still needs cleaningup and revising... (also think of better strategies to resolve and test corner cases) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 19:02:26 +04:00			`# skip non-type fields...`
got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`if ext not in TYPES:`
			`continue`
			`for path in paths:`
grouping almost done, still needs cleaningup and revising... (also think of better strategies to resolve and test corner cases) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 19:02:26 +04:00			`matches = []`
got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`for i, c in enumerate(common):`
			`if path[:len(c)] == c:`
grouping almost done, still needs cleaningup and revising... (also think of better strategies to resolve and test corner cases) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 19:02:26 +04:00			`matches += [(len(c), i)]`
			`# multiple matches...`
			`if len(matches) > 1:`
			`matches.sort(key=lambda e: e[0])`
			`if matches[0][0] == matches[1][0]:`
			`# XXX we could try and use a different`
			`# strategy...`
			`##!!! do a better error...`
			`raise Exception, 'got two targets with same score, can\'t decide where to put the file.'`
			`del matches[1:]`
got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`if len(matches) == 1:`
grouping almost done, still needs cleaningup and revising... (also think of better strategies to resolve and test corner cases) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 19:02:26 +04:00			`i = matches[0][1]`
got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`# we found a location...`
			`if ext not in res[i]:`
			`res[i][ext] = []`
			`res[i][ext] += [path]`
			`else:`
			`# XXX ungrouped...`
			`print '!!!!', path, name, ext`
grouping almost done, still needs cleaningup and revising... (also think of better strategies to resolve and test corner cases) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 19:02:26 +04:00			`raise Exception, 'still got ungrouped files...'`
got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00
			`# yield the results...`
			`for e in res:`
			`yield e['gid'], res`
			`else:`
			`yield uuid.uuid4(), data`
started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00


			`#-----------------------------------------------------------------------`
			`if __name__ == '__main__':`
			`lst = list(list_files(config['ARCHIVE_ROOT']))`

			`print len(lst)`

got the grouping mostly working. corner cases still fail (run index.py to see the ungrouped files) Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-10 18:43:46 +04:00			`index = index_by_name(list_files(config['ARCHIVE_ROOT']))`

			`print len(index)`

			`json.dump(index, file(os.path.join('test', 'filelist.json'), 'w'))`

			`index = list(split_images(index_by_name(list_files(config['ARCHIVE_ROOT']))))`

			`print len(index)`
started work on indexing... Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com> 2011-11-03 01:11:10 +03:00


			`#=======================================================================`
			`# vim:set ts=4 sw=4 nowrap :`