split store functionality... still needs work.

Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com>
Alex A. Naanou 2012-03-13 18:36:53 +04:00
parent d8fd5bbb10
commit 388215cdf3
3 changed files with 282 additions and 249 deletions

gid.py (21 lines changed)

@@ -1,13 +1,15 @@
 #=======================================================================
 __version__ = '''0.0.01'''
-__sub_version__ = '''20120310183438'''
+__sub_version__ = '''20120313182702'''
 __copyright__ = '''(c) Alex A. Naanou 2011'''
 #-----------------------------------------------------------------------
 import os
+import sha
+import md5
 import pyexiv2 as metadata
@@ -20,7 +22,7 @@ import pyexiv2 as metadata
 # XXX not yet sure if this is unique enough to avoid conflicts if one
 #     photographer has enough cameras...
 # XXX also might be wise to add a photographer ID into here...
-def image_gid(path, format='%(artist)s-%(date)s-%(name)s', date_format='%Y%m%d-%H%M%S'):
+def image_gid(path, format='%(artist)s-%(date)s-%(name)s', date_format='%Y%m%d-%H%M%S', hash_func=sha.sha):
     '''
     Calculate image GID.
@@ -35,8 +37,12 @@ def image_gid(path, format='%(artist)s-%(date)s-%(name)s', date_format='%Y%m%d-%
     Example:
         Alex_A.Naanou-20110627-195706-DSC_1234
+    If hash_func is not None, then the function will be used to generate
+    a hex hash from the above string.
+
     Supported fields:
-        %(artist)s - Exif.Image.Artist field, stripped and spaces replaced with underscores.
+        %(artist)s - Exif.Image.Artist field, stripped and spaces replaced
+                     with underscores.
         %(date)s - Exif.Image.DateTime formatted to date_format argument.
         %(name)s - file name.
@@ -57,9 +63,18 @@ def image_gid(path, format='%(artist)s-%(date)s-%(name)s', date_format='%Y%m%d-%
     if '%(artist)s' in format:
         data['artist'] = i['Exif.Image.Artist'].value.strip().replace(' ', '_')
+    if hash_func is not None:
+        return hash_func(format % data).hexdigest()
     return format % data
+
+
+#-----------------------------------------------------------------------
+if __name__ == '__main__':
+    pass
+
+
 #=======================================================================
 # vim:set ts=4 sw=4 nowrap :
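
A minimal usage sketch of the new hash_func parameter (Python 2, to match the
rest of this repo; the image path and its Exif contents here are made up):

    import sha
    import md5
    from gid import image_gid

    # plain, human-readable GID (no hashing)...
    print image_gid('test/DSC_1234.JPG', hash_func=None)
    # e.g. Alex_A.Naanou-20110627-195706-DSC_1234

    # hex digest of the same string (sha.sha is the new default)...
    print image_gid('test/DSC_1234.JPG')

    # any callable returning an object with a .hexdigest() method works...
    print image_gid('test/DSC_1234.JPG', hash_func=md5.md5)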

index.py (260 lines changed)

@@ -1,7 +1,7 @@
 #=======================================================================
 __version__ = '''0.0.01'''
-__sub_version__ = '''20120202193619'''
+__sub_version__ = '''20120313183420'''
 __copyright__ = '''(c) Alex A. Naanou 2011'''
@@ -11,12 +11,14 @@ import os
 import json
 import zipfile
 import uuid
-from pprint import pprint
 from itertools import izip, izip_longest
 from pli.logictypes import ANY, OR
+from pprint import pprint
+
+import store
 #-----------------------------------------------------------------------
@@ -244,240 +246,6 @@ def split_images(index):
-#-----------------------------------------------------------------------
-# XXX is this a good way to serialize the actual data in the fs???
-# NOTE: these will work with any topology and create a flat index...
-def save_file_index(index, path, index_depth=1, ext='.json'):
-    '''
-    NOTE: index_depth with value greater than 2 is overkill.
-    '''
-    root_index = {}
-    for k, v in index.items():
-        if index_depth > 0:
-            d = []
-            rest = k
-            # build index path...
-            for i in xrange(index_depth):
-                d += [rest[:2]]
-                rest = rest[2:]
-            # recursive directory construction...
-            if not os.path.exists(os.path.join(path, *d)):
-                os.mkdir(os.path.join(path, *d))
-            p = os.path.join(path, *d + [k + ext])
-        else:
-            p = os.path.join(path, k + ext)
-        json.dump(v, file(p, 'w'), indent=4, separators=(', ', ': '))
-        root_index[k] = p
-##        print '.',
-    return root_index
-
-
-def load_file_index(path, ext='.json', pack_ext='.pack'):
-    '''
-    '''
-    d = {}
-    for p, _, files in os.walk(path):
-        for f in files:
-            # handle single files...
-            if f.endswith(ext):
-                d[os.path.splitext(f)[0]] = json.load(file(os.path.join(p, f)))
-            # handle packs...
-            elif f.endswith(pack_ext):
-                pack = zipfile.ZipFile(os.path.join(p, f))
-                # load elements from the pack...
-                for name in pack.namelist():
-                    if name.endswith(ext):
-                        d[os.path.splitext(name)[0]] = json.loads(pack.read(name))
-    return d
-
-
-# XXX should we remove empty dirs here???
-def pack_file_index(path, ext='.json', pack_ext='.pack', keep_files=False, keep_dirs=False):
-    '''
-    NOTE: if keep_files is True, keep_dirs option will be ignored.
-    '''
-    z = zipfile.ZipFile(os.path.join(path, 'index' + pack_ext), 'a', compression=zipfile.ZIP_DEFLATED)
-    for p, _, files in os.walk(path):
-        for f in files:
-            if f.endswith(ext):
-                z.write(os.path.join(p, f), os.path.split(f)[-1])
-                if not keep_files:
-                    os.remove(os.path.join(p, f))
-                    # XXX this will not remove empty dirs (push one
-                    #     level up for that...)
-                    if not keep_dirs and p != path:
-                        ##!!! check if dir is empty....
-                        try:
-                            # NOTE: this will fail for non-empty dirs...
-                            os.rmdir(os.path.join(p))
-                        except:
-                            pass
-    z.close()
-
-
-##!!! get path by name helper...
-##!!!
-
-
-#-----------------------------------------------------------------------
-# lazy dict-like objects that read and (optionally) write the fs...
-
-import pli.pattern.mixin.mapping as mapping
-import pli.objutils as objutils
-
-
-# XXX might be good to do a path index...
-##!!! make this archive/file structure-agnostic...
-class Index(mapping.Mapping):
-    __json_ext__ = '.json'
-    __pack_ext__ = '.pack'
-
-    def __init__(self, path):
-        '''
-        '''
-        self._path = path
-
-    # specific interface...
-    ##!!! make this support different depths...
-    def __locations__(self, name):
-        '''
-        '''
-        ext = self.__json_ext__
-        name += ext
-        # build probable locations...
-        return (
-            name,
-            # look in a directory...
-            os.path.join(name[:2], name),
-            ##!!! HACK: make this dynamic...
-            os.path.join(name[:2], name[2:4], name),
-        )
-
-    # mapping interface...
-    def __getitem__(self, name):
-        '''
-        '''
-##        ext = self.__json_ext__
-        pack_ext = self.__pack_ext__
-##        file_name = name + ext
-        locations = self.__locations__(name)
-        # look for the file directly...
-        for n in locations:
-            if os.path.exists(os.path.join(self._path, n)):
-                return json.load(file(os.path.join(self._path, n)))
-        # try and locate a file in a pack...
-        for p, _, files in os.walk(self._path):
-            # files are searched sorted by their name...
-            files.sort()
-            for f in files:
-##                ##!!! do we need to look in odd named directories...
-##                if f == file_name:
-##                    return json.load(file(os.path.join(p, file_name)))
-                if f.endswith(pack_ext):
-                    z = zipfile.ZipFile(os.path.join(p, f))
-                    for n in locations:
-                        if n in z.namelist():
-                            return json.loads(z.read(n))
-        raise KeyError, name
-
-    def __setitem__(self, name, value):
-        '''
-        '''
-        raise NotImplementedError
-
-    def __delitem__(self, name):
-        '''
-        '''
-        raise NotImplementedError
-
-    def __iter__(self):
-        '''
-        '''
-        visited = []
-        packs = []
-        ext = self.__json_ext__
-        pack_ext = self.__pack_ext__
-        for p, _, files in os.walk(self._path):
-            for f in files:
-                if f.endswith(ext) and f not in visited:
-                    visited += [f]
-                    yield os.path.splitext(f)[0]
-                elif f.endswith(pack_ext):
-                    packs += [os.path.join(p, f)]
-        for pack in packs:
-            z = zipfile.ZipFile(pack)
-            for name in z.namelist():
-                if name not in visited:
-                    visited += [name]
-                    yield os.path.splitext(name)[0]
-
-
-REMOVED = object()
-
-class IndexWithCache(Index):
-    '''
-    '''
-    objutils.createonaccess('_cache', dict)
-
-    __sync__ = False
-
-    def __getitem__(self, name):
-        '''
-        '''
-        if name in self._cache:
-            res = self._cache[name]
-            if res is REMOVED:
-                raise KeyError, name
-            return res
-        res = self._cache[name] = super(IndexWithCache, self).__getitem__(name)
-        return res
-
-    def __setitem__(self, name, value):
-        '''
-        '''
-        self._cache[name] = value
-        if self.__sync__:
-            self.cache_flush(name)
-
-    ##!!!
-    def __delitem__(self, name):
-        '''
-        '''
-        self._cache[name] = REMOVED
-        if self.__sync__:
-            self.cache_flush(name)
-
-    def __iter__(self):
-        '''
-        '''
-        cache = self._cache
-        for e in cache:
-            yield e
-        for e in super(IndexWithCache, self).__iter__():
-            if e not in cache:
-                yield e
-
-    # cache management...
-    ##!!! removed items will not get flushed yet...
-    # XXX to make removing elements history compatible, one way to go
-    #     is to write a specific value to the file, thus making it
-    #     shadow the original value...
-    def cache_flush(self, *keys):
-        '''
-        '''
-        if keys == ():
-            return save_file_index(self._cache, self._path)
-        flush = {}
-        for k in keys:
-            if k is REMOVED:
-                # remove file...
-##                raise NotImplementedError
-                ##!!!
-                continue
-            flush[k] = self[k]
-        return save_file_index(flush, self._path)
-
-    def cache_drop(self):
-        '''
-        '''
-        del self._cache
 #-----------------------------------------------------------------------
 ##!!! test implementation: rewrite...
 import pyexiv2 as metadata
@@ -566,7 +334,7 @@ def build_image_cache(ic, min_rating, dest, tmp_path, preview_size=900):
             continue
     ic.cache_flush()
-    pack_file_index(ic._path, keep_files=False)
+    store.pack_file_index(ic._path, keep_files=False)
     return res
@@ -594,27 +362,27 @@ if __name__ == '__main__':
-    root_index = save_file_index(index, os.path.join('test', 'index'), index_depth=1)
+    root_index = store.save_file_index(index, os.path.join('test', 'index'), index_depth=1)
 ##    ##!!! this is not used in anything yet...
 ##    json.dump(root_index, file(os.path.join('test', 'index', 'file_index.json'), 'w'))
-    pack_file_index(os.path.join('test', 'index'), keep_files=False)
+    store.pack_file_index(os.path.join('test', 'index'), keep_files=False)
-    d = load_file_index(os.path.join('test', 'index'))
+    d = store.load_file_index(os.path.join('test', 'index'))
     print len(d)
     k = d.keys()[0]
-    i = Index(os.path.join('test', 'index'))
+    i = store.Index(os.path.join('test', 'index'))
     print len(i)
 ##    print i[k]
-    ic = IndexWithCache(os.path.join('test', 'index'))
+    ic = store.IndexWithCache(os.path.join('test', 'index'))
     print ic[k]
@@ -622,13 +390,13 @@ if __name__ == '__main__':
     ic.cache_flush()
-    pack_file_index(ic._path, keep_files=False)
+    store.pack_file_index(ic._path, keep_files=False)
     ic.__sync__ = True
     ic['111111111111111111111111111111111'] = {}
-    pack_file_index(ic._path, keep_files=False)
+    store.pack_file_index(ic._path, keep_files=False)
     ##!!! revise...
@@ -647,12 +415,12 @@ if __name__ == '__main__':
     full = dict(json.load(file(os.path.join('test', 'filelist of 20k files.json'))))
     print 'writing files...'
-    root_index = save_file_index(full, os.path.join('test', 'index'), index_depth=1)
+    root_index = store.save_file_index(full, os.path.join('test', 'index'), index_depth=1)
     print 'packing files...'
     # NOTE: the initial archiving seems REALLY SLOW, but working with
     #       small numbers of files from the archive seems adequate...
-    pack_file_index(os.path.join('test', 'index'), keep_files=True)
+    store.pack_file_index(os.path.join('test', 'index'), keep_files=True)
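
For reference, a round-trip sketch of the store API as exercised by the
__main__ section above (the sample key and data are made up; note that
save_file_index shards keys by their leading characters, so with
index_depth=1 the key 'abcdef...' lands in 'ab/abcdef....json', and that it
uses os.mkdir, which is not recursive, so index_depth >= 2 assumes the
parent dirs already exist):

    import os
    import store

    if not os.path.exists(os.path.join('test', 'index')):
        os.makedirs(os.path.join('test', 'index'))

    index = {
        'abcdef0123456789': {'path': 'test/DSC_1234.JPG', 'rating': 5},
    }

    # one .json file per key, sharded into two-char dirs...
    root_index = store.save_file_index(index, os.path.join('test', 'index'), index_depth=1)

    # move the loose .json files into test/index/index.pack (a zip file)...
    store.pack_file_index(os.path.join('test', 'index'), keep_files=False)

    # read everything back, loose files and packs alike...
    d = store.load_file_index(os.path.join('test', 'index'))
    assert d['abcdef0123456789'] == index['abcdef0123456789']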

store.py (new executable file, 250 lines)

@@ -0,0 +1,250 @@
+#=======================================================================
+__version__ = '''0.0.01'''
+__sub_version__ = '''20120313183119'''
+__copyright__ = '''(c) Alex A. Naanou 2011'''
+#-----------------------------------------------------------------------
+
+import os
+import json
+import zipfile
+
+
+#-----------------------------------------------------------------------
+# XXX is this a good way to serialize the actual data in the fs???
+# NOTE: these will work with any topology and create a flat index...
+def save_file_index(index, path, index_depth=1, ext='.json'):
+    '''
+    NOTE: index_depth with value greater than 2 is overkill.
+    '''
+    root_index = {}
+    for k, v in index.items():
+        if index_depth > 0:
+            d = []
+            rest = k
+            # build index path...
+            for i in xrange(index_depth):
+                d += [rest[:2]]
+                rest = rest[2:]
+            # recursive directory construction...
+            if not os.path.exists(os.path.join(path, *d)):
+                os.mkdir(os.path.join(path, *d))
+            p = os.path.join(path, *d + [k + ext])
+        else:
+            p = os.path.join(path, k + ext)
+        json.dump(v, file(p, 'w'), indent=4, separators=(', ', ': '))
+        root_index[k] = p
+##        print '.',
+    return root_index
+
+
+def load_file_index(path, ext='.json', pack_ext='.pack'):
+    '''
+    '''
+    d = {}
+    for p, _, files in os.walk(path):
+        for f in files:
+            # handle single files...
+            if f.endswith(ext):
+                d[os.path.splitext(f)[0]] = json.load(file(os.path.join(p, f)))
+            # handle packs...
+            elif f.endswith(pack_ext):
+                pack = zipfile.ZipFile(os.path.join(p, f))
+                # load elements from the pack...
+                for name in pack.namelist():
+                    if name.endswith(ext):
+                        d[os.path.splitext(name)[0]] = json.loads(pack.read(name))
+    return d
+
+
+# XXX should we remove empty dirs here???
+def pack_file_index(path, ext='.json', pack_ext='.pack', keep_files=False, keep_dirs=False):
+    '''
+    NOTE: if keep_files is True, keep_dirs option will be ignored.
+    '''
+    z = zipfile.ZipFile(os.path.join(path, 'index' + pack_ext), 'a', compression=zipfile.ZIP_DEFLATED)
+    for p, _, files in os.walk(path):
+        for f in files:
+            if f.endswith(ext):
+                z.write(os.path.join(p, f), os.path.split(f)[-1])
+                if not keep_files:
+                    os.remove(os.path.join(p, f))
+                    # XXX this will not remove empty dirs (push one
+                    #     level up for that...)
+                    if not keep_dirs and p != path:
+                        ##!!! check if dir is empty....
+                        try:
+                            # NOTE: this will fail for non-empty dirs...
+                            os.rmdir(os.path.join(p))
+                        except:
+                            pass
+    z.close()
+
+
+##!!! get path by name helper...
+##!!!
+
+
+#-----------------------------------------------------------------------
+# lazy dict-like objects that read and (optionally) write the fs...
+
+import pli.pattern.mixin.mapping as mapping
+import pli.objutils as objutils
+
+
+# XXX might be good to do a path index...
+##!!! make this archive/file structure-agnostic...
+class Index(mapping.Mapping):
+    __json_ext__ = '.json'
+    __pack_ext__ = '.pack'
+
+    def __init__(self, path):
+        '''
+        '''
+        self._path = path
+
+    # specific interface...
+    ##!!! make this support different depths...
+    def __locations__(self, name):
+        '''
+        '''
+        ext = self.__json_ext__
+        name += ext
+        # build probable locations...
+        return (
+            name,
+            # look in a directory...
+            os.path.join(name[:2], name),
+            ##!!! HACK: make this dynamic...
+            os.path.join(name[:2], name[2:4], name),
+        )
+
+    # mapping interface...
+    def __getitem__(self, name):
+        '''
+        '''
+##        ext = self.__json_ext__
+        pack_ext = self.__pack_ext__
+##        file_name = name + ext
+        locations = self.__locations__(name)
+        # look for the file directly...
+        for n in locations:
+            if os.path.exists(os.path.join(self._path, n)):
+                return json.load(file(os.path.join(self._path, n)))
+        # try and locate a file in a pack...
+        for p, _, files in os.walk(self._path):
+            # files are searched sorted by their name...
+            files.sort()
+            for f in files:
+##                ##!!! do we need to look in odd named directories...
+##                if f == file_name:
+##                    return json.load(file(os.path.join(p, file_name)))
+                if f.endswith(pack_ext):
+                    z = zipfile.ZipFile(os.path.join(p, f))
+                    for n in locations:
+                        if n in z.namelist():
+                            return json.loads(z.read(n))
+        raise KeyError, name
+
+    def __setitem__(self, name, value):
+        '''
+        '''
+        raise NotImplementedError
+
+    def __delitem__(self, name):
+        '''
+        '''
+        raise NotImplementedError
+
+    def __iter__(self):
+        '''
+        '''
+        visited = []
+        packs = []
+        ext = self.__json_ext__
+        pack_ext = self.__pack_ext__
+        for p, _, files in os.walk(self._path):
+            for f in files:
+                if f.endswith(ext) and f not in visited:
+                    visited += [f]
+                    yield os.path.splitext(f)[0]
+                elif f.endswith(pack_ext):
+                    packs += [os.path.join(p, f)]
+        for pack in packs:
+            z = zipfile.ZipFile(pack)
+            for name in z.namelist():
+                if name not in visited:
+                    visited += [name]
+                    yield os.path.splitext(name)[0]
+
+
+REMOVED = object()
+
+class IndexWithCache(Index):
+    '''
+    '''
+    objutils.createonaccess('_cache', dict)
+
+    __sync__ = False
+
+    def __getitem__(self, name):
+        '''
+        '''
+        if name in self._cache:
+            res = self._cache[name]
+            if res is REMOVED:
+                raise KeyError, name
+            return res
+        res = self._cache[name] = super(IndexWithCache, self).__getitem__(name)
+        return res
+
+    def __setitem__(self, name, value):
+        '''
+        '''
+        self._cache[name] = value
+        if self.__sync__:
+            self.cache_flush(name)
+
+    ##!!!
+    def __delitem__(self, name):
+        '''
+        '''
+        self._cache[name] = REMOVED
+        if self.__sync__:
+            self.cache_flush(name)
+
+    def __iter__(self):
+        '''
+        '''
+        cache = self._cache
+        for e in cache:
+            yield e
+        for e in super(IndexWithCache, self).__iter__():
+            if e not in cache:
+                yield e
+
+    # cache management...
+    ##!!! removed items will not get flushed yet...
+    # XXX to make removing elements history compatible, one way to go
+    #     is to write a specific value to the file, thus making it
+    #     shadow the original value...
+    def cache_flush(self, *keys):
+        '''
+        '''
+        if keys == ():
+            return save_file_index(self._cache, self._path)
+        flush = {}
+        for k in keys:
+            if k is REMOVED:
+                # remove file...
+##                raise NotImplementedError
+                ##!!!
+                continue
+            flush[k] = self[k]
+        return save_file_index(flush, self._path)
+
+    def cache_drop(self):
+        '''
+        '''
+        del self._cache
+
+
+#=======================================================================
+# vim:set ts=4 sw=4 nowrap :
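
And a sketch of the lazy mapping classes over the same layout (keys again
made up; Index is read-only, its __setitem__/__delitem__ raise
NotImplementedError, while IndexWithCache buffers writes and only touches
the fs on cache_flush(), or on every write once __sync__ is set):

    import os
    import store

    # lazy lookup: tries <key>.json directly, then the sharded dirs,
    # then inside any *.pack archives...
    i = store.Index(os.path.join('test', 'index'))
    print len(i)
    print i['abcdef0123456789']

    # buffered writes...
    ic = store.IndexWithCache(os.path.join('test', 'index'))
    ic['ffffffffffffffff'] = {'rating': 3}
    ic.cache_flush()                # writes the whole cache out as .json

    # write-through mode...
    ic.__sync__ = True
    ic['eeeeeeeeeeeeeeee'] = {'rating': 1}

    # and pack the flushed files back into the archive...
    store.pack_file_index(ic._path, keep_files=False)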