mirror of
				https://github.com/flynx/ImageGrid.git
				synced 2025-11-03 04:40:10 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			417 lines
		
	
	
		
			9.5 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			417 lines
		
	
	
		
			9.5 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
#=======================================================================
 | 
						|
 | 
						|
__version__ = '''0.0.01'''
 | 
						|
__sub_version__ = '''20130326030151'''
 | 
						|
__copyright__ = '''(c) Alex A. Naanou 2011'''
 | 
						|
 | 
						|
 | 
						|
#-----------------------------------------------------------------------
 | 
						|
# The first index.py might be a little too complicated. Try to resolve
# this as follows:
 | 
						|
# 	- list all relevant files (RAW, XMP, JPG, PSD, ...)
 | 
						|
# 	- group by path (closeness)
 | 
						|
# 		- deepest common path to contain all files with common name.
 | 
						|
# 		  this will fail if we have different files with same names.
 | 
						|
#
 | 
						|
#-----------------------------------------------------------------------
 | 
						|
 | 
						|
import os
 | 
						|
import json
 | 
						|
import zipfile
 | 
						|
import uuid
 | 
						|
import time
 | 
						|
from pprint import pprint
 | 
						|
from itertools import izip, izip_longest
 | 
						|
 | 
						|
import pyexiv2 as metadata
 | 
						|
import couchdb
 | 
						|
 | 
						|
from pli.logictypes import ANY, OR
 | 
						|
 | 
						|
import store
 | 
						|
from gid import image_gid
 | 
						|
 | 
						|
 | 
						|
#-----------------------------------------------------------------------
 | 
						|
 | 
						|
# Name of the JSON configuration file to load. The name is relative, so
# it is looked up in the current working directory. The alternative
# configurations are kept commented out for quick switching.
##CONFIG_NAME = 'hdd9_config.json'
CONFIG_NAME = 'P7000_config.json'
##CONFIG_NAME = 'staging_config.json'

# NOTE: the config is read at import time; the file is closed explicitly
# 		instead of leaving the open handle to the garbage collector...
with open(CONFIG_NAME) as _config_file:
	config = json.load(_config_file)
 | 
						|
 | 
						|
# XXX move this to a context-dependant module...
 | 
						|
# Extension patterns, one per supported kind of file.
# NOTE: these are compared (via ==) to a bare extension, i.e. without 
# 		the leading dot, so both the upper and the lower case spelling
# 		of each extension is listed.
# NOTE(review): OR(..) comes from pli.logictypes and presumably compares
# 		equal to any one of its arguments -- confirm against pli.
RAW = OR(
	'NEF', 'nef',		# Nikon
	'RW2', 'rw2',		# Panasonic/Leica
	'CRW', 'crw',		# Canon
	'CR2', 'cr2',		# Canon
	'X3F', 'x3f',		# Sigma
	'DNG', 'dng',		# Adobe/Leica
)
JPEG = OR('JPG', 'jpg', 'JPEG', 'jpeg')
PSD = OR('PSD', 'psd')
TIFF = OR('TIFF', 'tiff', 'TIF', 'tif')
XMP = OR('XMP', 'xmp')

# any of the above...
ITEM = OR(RAW, JPEG, PSD, TIFF, XMP)

# type name to extension pattern mapping...
TYPES = {
	'raw': RAW, 
	'jpeg': JPEG, 
	'psd': PSD, 
	'tiff': TIFF, 
	'xmp': XMP,
}

# directory name list_files(..) does not descend into by default...
SKIP_DIRS = '.sys2'
# a directory containing a file with this name is skipped by 
# list_files(..) together with everything below it...
SKIP_MARKER = '.skipindexing'

# default value of the sub_trees argument of list_files(..)...
SUBTREE_CLASSES = {
	'preview': 'preview', 
	'preview (RAW)': 'RAW preview', 
}
 | 
						|
 | 
						|
 | 
						|
#-----------------------------------------------------------------------
 | 
						|
 | 
						|
 | 
						|
 | 
						|
#----------------------------------------------------------list_files---
 | 
						|
##!!! we will need to normalize the paths to one single scheme (either relative or absolute)...
 | 
						|
# XXX might need to fetch file data too...
 | 
						|
def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM, 
		include_root_path=False, include_ctime=True, 
		skip_marker=SKIP_MARKER, skip_dirs=SKIP_DIRS):
	'''
	Walk the tree under root and yield a record per matching file.

	root				- directory to walk.
	sub_trees			- not used by this function at the moment.
	type				- pattern the file extension (without the dot)
						  is compared to via ==.
	include_root_path	- if true <path> is the walked directory split 
						  on os.path.sep as is, otherwise <path> is 
						  relative to root.
	include_ctime		- if true add os.path.getctime(..) of the file
						  to each record.
	skip_marker			- name of a file that makes the directory 
						  containing it, and everything below it, be 
						  skipped.
	skip_dirs			- directory name, or a list/tuple/set of names,
						  not to descend into.

	yields:
		(<path>, <name>, <ext>[, <ctime>]),

	where <path> is a list of directory names, <name> is the file name
	without the extension and <ext> is the extension without the dot.
	'''
	# a single name is also accepted for backwards compatibility...
	if not isinstance(skip_dirs, (list, tuple, set, frozenset)):
		skip_dirs = (skip_dirs,)

	for orig_path, dirs, files in os.walk(root):
		# skip dir trees containing skip_marker...
		if skip_marker in files:
			# NOTE: emptying dirs in place stops os.walk(..) from 
			# 		descending any further...
			del dirs[:]
			continue
		# skip dirs...
		# NOTE: this must also be done in place for os.walk(..) to see it...
		dirs[:] = [ d for d in dirs if d not in skip_dirs ]

		if include_root_path:
			path = orig_path.split(os.path.sep)
		# remove root from path...
		else:
			# NOTE: counting the components of root to strip them breaks 
			# 		when root ends with a path separator (the first 
			# 		sub-directory gets dropped too), thus the path is 
			# 		made relative explicitly...
			rel = os.path.relpath(orig_path, root)
			path = [] if rel == os.curdir else rel.split(os.path.sep)

		# process files...
		for f in files:
			name, ext = os.path.splitext(f)
			# we need the extension without the dot...
			ext = ext[1:]
			# filter by ext...
			if ext == type:
				if include_ctime:
					t = os.path.getctime(os.path.join(orig_path, f))
					yield path, name, ext, t
				else:
					yield path, name, ext
 | 
						|
 | 
						|
 | 
						|
#----------------------------------------------------------common_len---
 | 
						|
def common_len(a, *b):
	'''
	Calculate the common path length.

	a, b	- one or more sequences of hashable items (in this module:
			  paths split into components).

	returns the number of leading positions at which all the given 
	sequences hold equal items. For a single sequence this is its 
	length.
	'''
	paths = (a,) + b
	for i, l in enumerate(zip(*paths)):
		# the items at position i are not all the same...
		if len(set(l)) != 1:
			return i
	# no mismatch -- the shortest sequence is a prefix of all the others...
	# NOTE: min(*paths) can not be used to find it: with a single 
	# 		argument it returns the smallest item of that argument 
	# 		rather than the argument itself...
	return min(len(p) for p in paths)
 | 
						|
 | 
						|
 | 
						|
#-------------------------------------------------------path_distance---
 | 
						|
##!!! is this meaningless?
 | 
						|
def path_distance(a, b):
	'''
	Count the components of a and b that are not part of their common 
	prefix.

	a, b	- paths split into components.

	returns the number of trailing components of a plus the number of 
	trailing components of b left after removing the prefix the two 
	share (see: common_len(..)).
	'''
	shared = common_len(a, b)
	return (len(a) - shared) + (len(b) - shared)
 | 
						|
 | 
						|
 | 
						|
#-------------------------------------------------------index_by_name---
 | 
						|
def index_by_name(lst):
	'''
	index by file name (indexing preparation)...

	lst		- any iterable of (<path>, <name>, <ext>, ...) records, 
			  e.g. a list or the generator list_files(..) returns. 
			  The input is not modified.

	format:
	{
		<name> : [
			(<path>, <name>, ...),
			...
		],
		...
	}

	The records under each name are ordered by extension and then by 
	path.
	'''
	res = {}
	# NOTE: sorted(..) builds a new list, this avoids side-effects and 
	# 		removes the need for the input to be a list...
	# sort via name, ext, path
	for e in sorted(lst, key=lambda e: (e[1], e[2], e[0])):
		res.setdefault(e[1], []).append(e)
	return res
 | 
						|
 | 
						|
 | 
						|
 | 
						|
#-------------------------------------------------------split_by_raws---
 | 
						|
def split_by_raws(raws, lst, failed):
	'''
	Distribute same-named files between several raw files by path 
	closeness.

	raws	- the raw file records, (<path>, <name>, <ext>, ...), the 
			  caller passes this when there is more than one.
	lst		- all the records to distribute; records with a raw 
			  extension are skipped.
	failed	- list, records that could not be placed are appended to 
			  it in place.

	returns:
		[ [<raw>, [<raw>, <file>, ...]], ... ]

	A file is attached to the raw it shares the longest leading path 
	with, provided that path is longer than the one common to all the 
	raws and that no other raw is just as close.
	'''
	# top level common path...
	common = common_len(*[ e[0] for e in raws ])

	# NOTE: sets and the closeness list below are both in the order of 
	# 		raws, this is what lets an index found in one be used in 
	# 		the other...
	sets = [ [r, [r]] for r in raws ]

	for e in lst:
		if e[2] == RAW:
			continue
		# check if we are closer to other raws...
		c_index = [ common_len(r[0], e[0]) for r in raws ]
		c = max(c_index)
		# we have two locations with identical weight...
		# NOTE: the closeness values are compared directly, matching 
		# 		(c, raw, i) tuples against a [c, ANY, ANY] list never 
		# 		succeeds as a tuple is never equal to a list...
		if c_index.count(c) > 1:
			# a file is at a path junction exactly...
			print('    !!! can\'t decide where to put %s.%s...' % (e[1], e[2]))
			##!!! try different strategies here...
			##!!!
			failed.append(e)
		# found a location...
		elif c > common:
			# NOTE: c is unique here so .index(..) finds the closest raw...
			sets[c_index.index(c)][1].append(e)
		# file in an odd location ##!!! list these locations...
		else:
			print('    !!! can\'t decide where to put %s.%s...' % (e[1], e[2]))
			##!!! try different strategies here...
			##!!!
			failed.append(e)
	return sets
 | 
						|
 | 
						|
 | 
						|
#-----------------------------------------------------------gid_index---
 | 
						|
##!!! this will rewrite existing data -- should only update...
 | 
						|
def gid_index(index, existing=None):
	'''
	Build, or update, an index of image records keyed by GID.

	index		- {<name>: [(<path>, <name>, <ext>, <ctime>), ...], ...}
				  in the format index_by_name(..) produces.
				  NOTE: the lists are sorted in place.
	existing	- mapping of GID to image record to update in place, 
				  if None a new dict is built.

	returns:
		(<index>, <failed>, <skipped>)

	where <failed> lists the file records that could not be attached 
	to a raw file and <skipped> lists the GIDs of already indexed 
	images that did not change.

	Each image record holds the 'gid', 'name', 'imported', 'updated' 
	and 'ctime' fields plus the file records split by type into 'RAW',
	'XMP', 'JPG', 'PSD', 'TIFF' and 'other'.

	Progress is printed to stdout.
	'''
	import sys

	skipped = []
	# index via a proper GID...
	# split similarly named but different files...
	if existing is None:
		res = {}
	else:
		res = existing
	failed = []
	im_n = 0
	up_n = 0
	new_n = 0

	# NOTE: this does not depend on the image so it is built only once...
	known = OR(TIFF, PSD, JPEG, XMP, RAW)

	for name, l in index.iteritems():
		l.sort()
		raws = [e for e in l if e[2] == RAW] 

		# multiple raw files...
		if len(raws) > 1:
			sets = split_by_raws(raws, l, failed)
		# single raw...
		elif len(raws) == 1:
			sets = [(raws[0], l)]
		# no raw files...
		else:
			# NOTE: the leading spaces and '\r' overwrite the progress 
			# 		line printed below...
			print((' '*78) + ' ' + '\rno raw file found for "%s"...' % name)
			sets = []
			##!!! need to report this in a usable way...
			failed += l

		# add actual elements to index...
		for raw, files in sets:
			im_n += 1
			# NOTE: the line ends with '\r' and no newline so that it 
			# 		gets overwritten by the next one...
			sys.stdout.write('Processing image: %s new: %s updated: %s \r' 
					% (im_n, new_n, up_n))

			# get file GID...
			GID = image_gid('%s.%s' % (
					os.path.join(*[config['ARCHIVE_ROOT']] + list(raw[0]) + [raw[1]]), 
					raw[2]))

			# NOTE: a single timestamp is used so that 'imported' and 
			# 		'updated' of a new record are exactly the same...
			now = time.time()

			##!!! normalize the image format...
			img = {
				'gid': GID,
				'name': name,
				'imported': now,
				'updated': now,
				# NOTE: this might get distorted on archiving or
				# 		copying...
				# 		mostly intended for importing...
				'ctime': raw[3], 
				##!!! make these more general...
				'RAW': [e for e in files if e[2] == RAW],
				'XMP': [e for e in files if e[2] == XMP],
				'JPG': [e for e in files if e[2] == JPEG],
				'PSD': [e for e in files if e[2] == PSD],
				'TIFF': [e for e in files if e[2] == TIFF],
				'other': [e for e in files if e[2] != known],
			}
			# add new data...
			if GID not in res:
				res[GID] = img
				new_n += 1
			# update existing...
			else:
				cur = res[GID]
				updating = False
				for k, v in img.iteritems():
					# skip the fields that describe the record itself...
					if k in ('imported', 'name', 'gid', 'ctime', 'updated'):
						continue
					# NOTE: a field missing from the stored record is 
					# 		treated as changed...
					if k not in cur or v != cur[k]:
						cur[k] = v
						updating = True
				# do the actual update...
				if updating:
					cur['updated'] = time.time()
					# NOTE(review): the record is assigned back, presumably 
					# 		to make a store-backed index write it -- confirm...
					res[GID] = cur
					up_n += 1
				else:
					skipped += [GID]

	return res, failed, skipped
 | 
						|
 | 
						|
 | 
						|
 | 
						|
#-----------------------------------------------------------------------
 | 
						|
# Build (or load a cached) flat file list of the archive, index it by
# GID into the store and print a summary...
if __name__ == '__main__':

	INDEX_PATH = config.get('INDEX_ROOT', os.path.join('test', 'index2'))

	FILE_LIST = os.path.join('test', 'flatfilelist-P7000-new.json')
##	FILE_LIST = os.path.join('test', 'flatfilelist-120kfiles.json')
##	FILE_LIST = os.path.join('test', 'flatfilelist.json')
	# the archive is only listed if there is no cached list...
	BUILD_FILE_LIST = not os.path.exists(FILE_LIST)


	if BUILD_FILE_LIST:
		lst = list(list_files(config['ARCHIVE_ROOT']))

		print('found files: %s' % len(lst))
##		pprint(lst[0])

		with open(FILE_LIST, 'w') as f:
			json.dump(lst, f)
		print('saved...')

	# NOTE: the list is always read back from the file, so the records 
	# 		below are in the form JSON gives (lists)...
	with open(FILE_LIST) as f:
		lst = json.load(f)
	print('loaded: %s' % len(lst))


	IMPORT_DIFF = False

	# skip already read files...
	if IMPORT_DIFF and not BUILD_FILE_LIST:
		lst_cur = list(list_files(config['ARCHIVE_ROOT']))

		print('found files: %s' % len(lst_cur))

		# NOTE: list_files(..) yields tuples while the loaded records 
		# 		are lists, these never compare equal so both are 
		# 		normalized to hashable tuples first...
		rec_key = lambda e: (tuple(e[0]),) + tuple(e[1:])
		known = set(rec_key(e) for e in lst)
		lst_cur = [ e for e in lst_cur if rec_key(e) not in known ]

		print('found new or updated files: %s' % len(lst_cur))

		lst = lst_cur

		# NOTE: the diff import stops here, nothing is indexed...
		raise SystemExit



	index = index_by_name(lst)


##	GID_index = store.IndexWithCache(INDEX_PATH)
	GID_index = store.Index(INDEX_PATH)

	# a cheating way to say if we are empty...
	index_empty = True
	for k in GID_index.iterkeys():
		index_empty = False
		break

	t0 = time.time()

	if not index_empty:
		print('updating...')
		##!!! this takes a substantially longer time initially... (about 30x longer)
		GID_index, failed, skipped = gid_index(index, GID_index)
	else:
		print('indexing...')
		GID_index, failed, skipped = gid_index(index)
		store.dump(GID_index, INDEX_PATH, index_depth=2)

	t1 = time.time()

	print('done in: %s seconds.' % (t1-t0))

	with open(os.path.join('test', 'failed-to-categorise.json'), 'w') as f:
		json.dump(failed, f)



	##!!! TODO: archive descriptions to help index/tag items...

	# NOTE: each import from an existing archive will be as follows:
	# 			- full listing
	# 			- find new subtrees
	# 			- find modified items (file date diff)
	
	# NOTE: raws number here may be more than indexed because some raws 
	# 		may get grouped by GID
	print('''results:
	indexed: %s
	raws: %s
	failed: %s
	skipped: %s
	''' % (
			len(GID_index), 
			len([ e for e in lst if e[2] == RAW]), 
			len(failed),
			len(skipped)))
 | 
						|
 | 
						|
##	##!!! this is really slow because it pulls ALL the data... wonder who wrote this? :)
 | 
						|
##	pprint(GID_index.itervalues().next())
 | 
						|
 | 
						|
##	store.dump(GID_index, INDEX_PATH)
 | 
						|
 | 
						|
##	store.pack(INDEX_PATH)
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
#=======================================================================
 | 
						|
#                                            vim:set ts=4 sw=4 nowrap :
 |