#=======================================================================

__version__ = '''0.0.01'''
__sub_version__ = '''20130326030151'''
__copyright__ = '''(c) Alex A. Naanou 2011'''


#-----------------------------------------------------------------------
# The first index.py might be a little too complicated. Try to resolve
# this as follows:
# 	- list all relevant files (RAW, XMP, JPG, PSD, ...)
# 	- group by path (closeness)
# 		- the deepest common path contains all files with a common name.
# 		  this will fail if we have different files with the same name.
#
#-----------------------------------------------------------------------
					
						
import os
import json
import zipfile
import uuid
import time
from pprint import pprint
from itertools import izip, izip_longest

import pyexiv2 as metadata
import couchdb

from pli.logictypes import ANY, OR

import store
from gid import image_gid

					
						
#-----------------------------------------------------------------------

##CONFIG_NAME = 'hdd9_config.json'
CONFIG_NAME = 'P7000_config.json'
##CONFIG_NAME = 'staging_config.json'

config = json.load(open(CONFIG_NAME))
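
# A sketch of the expected config contents -- the keys read elsewhere in
# this script are ARCHIVE_ROOT (required) and INDEX_ROOT (optional); the
# values below are hypothetical:
# 	{
# 		"ARCHIVE_ROOT": "path/to/archive",
# 		"INDEX_ROOT": "test/index2"
# 	}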
					
						

# XXX move this to a context-dependent module...
RAW = OR(
	# Nikon
	'NEF', 'nef',
	# Panasonic/Leica
	'RW2', 'rw2',
	# Canon
	'CRW', 'crw',
	'CR2', 'cr2',
	# Sigma
	'X3F', 'x3f',
	# Adobe/Leica
	'DNG', 'dng',
)

JPEG = OR(
	'JPG', 'jpg',
	'JPEG', 'jpeg',
)

PSD = OR(
	'PSD', 'psd'
)

TIFF = OR(
	'TIFF', 'tiff',
	'TIF', 'tif'
)

XMP = OR(
	'XMP', 'xmp'
)

ITEM = OR(RAW, JPEG, PSD, TIFF, XMP)

TYPES = {
	'raw': RAW,
	'jpeg': JPEG,
	'psd': PSD,
	'tiff': TIFF,
	'xmp': XMP,
}
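
# NOTE: pli.logictypes.OR is assumed to build a value that compares equal
# 		to any of its members -- that is what the extension checks below
# 		rely on; a rough, unverified sketch:
# 			'nef' == RAW 	# -> True
# 			'jpg' == ITEM 	# -> True
# 			'txt' == ITEM 	# -> False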
					
						

SKIP_DIRS = '.sys2'
SKIP_MARKER = '.skipindexing'


SUBTREE_CLASSES = {
	'preview': 'preview',
	'preview (RAW)': 'RAW preview',
}


#-----------------------------------------------------------------------
					
						


#----------------------------------------------------------list_files---
##!!! we will need to normalize the paths to one single scheme (either relative or absolute)...
# XXX might need to fetch file data too...
def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM,
		include_root_path=False, include_ctime=True,
		skip_marker=SKIP_MARKER, skip_dirs=SKIP_DIRS):
	'''
	yields:
		(<path>, <name>, <ext>[, <ctime>]),
	'''
	for orig_path, dirs, files in os.walk(root):
		# skip dir trees containing the skip_marker file...
		if skip_marker in files:
			del dirs[:]
			continue
		# skip dirs...
		while skip_dirs in dirs:
			dirs.remove(skip_dirs)

		# XXX is this correct...
		path = orig_path.split(os.path.sep)
		# remove root from path...
		if not include_root_path:
			path = path[len(root.split(os.path.sep)):]
		# process files...
		for f in files:
			name, ext = os.path.splitext(f)
			# we need the extension without the dot...
			ext = ext[1:]
			# filter by ext...
			if ext == type:
				if include_ctime:
					t = os.path.getctime(os.path.join(orig_path, f))
					yield path, name, ext, t
				else:
					yield path, name, ext
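
# Illustrative shape of the yielded items (hypothetical path and file),
# with include_ctime=True:
# 	(['2012', 'shoot-01'], 'DSC_0001', 'NEF', 1330520695.0)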
					
						


#----------------------------------------------------------common_len---
def common_len(a, *b):
	'''
	calculate the common path length.
	'''
	for i, l in enumerate(izip(*(a,) + b)):
		if len(set(l)) != 1:
			return i
	return len(min(*(a,) + b))
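
# Quick sanity sketch (not part of the original):
# 	common_len(['a', 'b', 'c'], ['a', 'b', 'd'])	# -> 2
# 	common_len(['a', 'b'], ['a', 'b', 'c'])			# -> 2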
					
						


#-------------------------------------------------------path_distance---
##!!! is this meaningless?
def path_distance(a, b):
	'''
	number of path steps between a and b via their deepest common prefix.
	'''
	return len(a) + len(b) - common_len(a, b)*2
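
# e.g. ['a', 'b', 'c'] vs ['a', 'b', 'd']: one step down on each side of
# the common prefix, so path_distance(...) == 2 (sanity sketch only).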
					
						


#-------------------------------------------------------index_by_name---
def index_by_name(lst):
	'''
	index by file name (indexing preparation)...

	format:
	{
		<name> : [
			(<path>, <name>, ...),
			...
		],
		...
	}
	'''
	res = {}
	# NOTE: this is to avoid side-effects...
	lst = lst[:]
	# sort via name, ext, path
	lst.sort(key=lambda e: (e[1], e[2], e[0]))
	for e in lst:
		n = e[1]
		if n in res:
			res[n] += [e]
		else:
			res[n] = [e]
	return res
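
# Illustrative (hypothetical entries):
# 	index_by_name([(['d1'], 'IMG_01', 'NEF'), (['d1'], 'IMG_01', 'xmp')])
# 	# -> {'IMG_01': [(['d1'], 'IMG_01', 'NEF'), (['d1'], 'IMG_01', 'xmp')]}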
					
						



#-------------------------------------------------------split_by_raws---
def split_by_raws(raws, lst, failed):
	'''
	split lst into one group per raw file by path closeness; entries that
	can not be placed unambiguously are appended to failed.
	'''
##	raws = [e for e in lst if e[2] == RAW]
	# top level common path...
	common = common_len(*[ e[0] for e in raws ])

	# NOTE: do not change the order of raws after this point
	# 		and till the end of the loop...
	# 		XXX revise if there is a simpler way...
	sets = [ [r, [r]] for r in raws ]

	for e in lst:
		if e[2] == RAW:
			continue
		# check if we are closer to other raws...
		# NOTE: this depends on stability of order in raws
		c_index = [(common_len(r[0], e[0]), r, i) for i, r in enumerate(raws)]
		c, raw, i = max(*c_index)
		# we have two locations with identical weight...
		# NOTE: the pattern must be a tuple -- a list would never compare
		# 		equal to the tuples in c_index...
		if c_index.count((c, ANY, ANY)) > 1:
			# a file is at a path junction exactly...
			print '    !!! can\'t decide where to put %s.%s...' % (e[1], e[2])
			##!!! try different strategies here...
			##!!!
			failed += [e]
		# found a location...
		elif c > common:
			sets[i][1] += [e]
		# file in an odd location ##!!! list these locations...
		else:
			print '    !!! can\'t decide where to put %s.%s...' % (e[1], e[2])
			##!!! try different strategies here...
			##!!!
			failed += [e]
	return sets
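
# Illustrative outcome (hypothetical entries): with raws a = (['d1'], 'IMG_01', 'NEF')
# and b = (['d2'], 'IMG_01', 'NEF'), a sidecar (['d1', 'proc'], 'IMG_01', 'xmp')
# shares a deeper path with a, so the result is [[a, [a, sidecar]], [b, [b]]].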
					
						



#-----------------------------------------------------------gid_index---
##!!! this will rewrite existing data -- should only update...
def gid_index(index, existing=None):
	'''
	build or update the GID-keyed image index from a name index (as
	produced by index_by_name); returns (index, failed, skipped).
	'''
	skipped = []
	# index via a proper GID...
	# split similarly named but different files...
	if existing is None:
		res = {}
	else:
		res = existing
	failed = []
	im_n = 0
	up_n = 0
	new_n = 0

	for name, l in index.iteritems():
		l.sort()
		raws = [e for e in l if e[2] == RAW]

		# multiple raw files...
		if len(raws) > 1:
			sets = split_by_raws(raws, l, failed)
		# single raw...
		elif len(raws) == 1:
			sets = [(raws[0], l)]
		# no raw files...
		else:
			print (' '*78), '\rno raw file found for "%s"...' % name
			sets = []
			##!!! need to report this in a usable way...
			failed += l

		# add actual elements to index...
		for raw, l in sets:
			im_n += 1
			print 'Processing image:', im_n, 'new:', new_n, 'updated:', up_n, '\r',

			# get file GID...
			GID = image_gid('%s.%s' % (os.path.join(*[config['ARCHIVE_ROOT']] + raw[0] + [raw[1]]), raw[2]))
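			# e.g. (hypothetical values): with ARCHIVE_ROOT 'archive' and
			# raw == (['2012', 'd1'], 'IMG_01', 'NEF', ...), the GID is
			# computed from 'archive/2012/d1/IMG_01.NEF' (separator per OS).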
					
						

			##!!! normalize the image format...
			img = {
				'gid': GID,
				'name': name,
				'imported': time.time(),
				'updated': time.time(),
				# NOTE: this might get distorted on archiving or
				# 		copying...
				# 		mostly intended for importing...
				'ctime': raw[3],
				##!!! make these more general...
				'RAW': [e for e in l if e[2] == RAW],
				'XMP': [e for e in l if e[2] == XMP],
				'JPG': [e for e in l if e[2] == JPEG],
				'PSD': [e for e in l if e[2] == PSD],
				'TIFF': [e for e in l if e[2] == TIFF],
				'other': [e for e in l if e[2] != OR(TIFF, PSD, JPEG, XMP, RAW)],
			}
			# add new data...
			if GID not in res:
				res[GID] = img
				new_n += 1
			# update existing...
			else:
				cur = res[GID]
				updating = False
				for k, v in img.iteritems():
					# skip bookkeeping fields -- these alone should not trigger an update...
					if k in ('imported', 'name', 'gid', 'ctime', 'updated'):
						continue
					if v != cur[k]:
						cur[k] = v
						updating = True
				# do the actual update...
				if updating:
					cur['updated'] = time.time()
					res[GID] = cur
					up_n += 1
				else:
					skipped += [GID]

	return res, failed, skipped
					
						



#-----------------------------------------------------------------------
if __name__ == '__main__':

	INDEX_PATH = config.get('INDEX_ROOT', os.path.join('test', 'index2'))

	FILE_LIST = os.path.join('test', 'flatfilelist-P7000-new.json')
##	FILE_LIST = os.path.join('test', 'flatfilelist-120kfiles.json')
##	FILE_LIST = os.path.join('test', 'flatfilelist.json')
	BUILD_FILE_LIST = not os.path.exists(FILE_LIST)


	if BUILD_FILE_LIST:
		lst = list(list_files(config['ARCHIVE_ROOT']))

		print 'found files:', len(lst)
##		pprint(lst[0])

		json.dump(lst, file(FILE_LIST, 'w'))
		print 'saved...'

	lst = json.load(file(FILE_LIST))
	print 'loaded:', len(lst)


	IMPORT_DIFF = False

	# skip already read files...
	if IMPORT_DIFF and not BUILD_FILE_LIST:
		lst_cur = list(list_files(config['ARCHIVE_ROOT']))

		print 'found files:', len(lst_cur)

		lst_cur = [ e for e in lst_cur if e not in lst ]

		print 'found new or updated files:', len(lst_cur)

		lst = lst_cur

		raise SystemExit



	index = index_by_name(lst)


##	GID_index = store.IndexWithCache(INDEX_PATH)
	GID_index = store.Index(INDEX_PATH)

	# a cheating way to check if the index is empty...
	index_empty = True
	for k in GID_index.iterkeys():
		index_empty = False
		break

	t0 = time.time()

	if not index_empty:
		print 'updating...'
		##!!! this takes a substantially longer time initially... (about 30x longer)
		GID_index, failed, skipped = gid_index(index, GID_index)
	else:
		print 'indexing...'
		GID_index, failed, skipped = gid_index(index)
		store.dump(GID_index, INDEX_PATH, index_depth=2)

	t1 = time.time()

	print 'done in:', t1-t0, 'seconds.'
					
						

	json.dump(failed, file(os.path.join('test', 'failed-to-categorise.json'), 'w'))



	##!!! TODO: archive descriptions to help index/tag items...

	# NOTE: each import from an existing archive will be as follows:
	# 			- full listing
	# 			- find new subtrees
	# 			- find modified items (file date diff)

	# NOTE: raws number here may be more than indexed because some raws
	# 		may get grouped by GID
	print '''results:
	indexed: %s
	raws: %s
	failed: %s
	skipped: %s
	''' % (
			len(GID_index),
			len([ e for e in lst if e[2] == RAW]),
			len(failed),
			len(skipped))

##	##!!! this is really slow because it pulls ALL the data... wonder who wrote this? :)
##	pprint(GID_index.itervalues().next())

##	store.dump(GID_index, INDEX_PATH)

##	store.pack(INDEX_PATH)




#=======================================================================
#                                            vim:set ts=4 sw=4 nowrap :