split store functionality... still needs work.

Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com>
Alex A. Naanou 2012-03-13 18:36:53 +04:00
parent d8fd5bbb10
commit 388215cdf3
3 changed files with 282 additions and 249 deletions

gid.py (21 lines changed)

@@ -1,13 +1,15 @@
 #=======================================================================
 __version__ = '''0.0.01'''
-__sub_version__ = '''20120310183438'''
+__sub_version__ = '''20120313182702'''
 __copyright__ = '''(c) Alex A. Naanou 2011'''
 #-----------------------------------------------------------------------
 import os
+import sha
+import md5
 import pyexiv2 as metadata
@@ -20,7 +22,7 @@ import pyexiv2 as metadata
 # XXX not yet sure if this is unique enough to avoid conflicts if one
 #     photographer has enough cameras...
 # XXX also might be wise to add a photographer ID into here...
-def image_gid(path, format='%(artist)s-%(date)s-%(name)s', date_format='%Y%m%d-%H%M%S'):
+def image_gid(path, format='%(artist)s-%(date)s-%(name)s', date_format='%Y%m%d-%H%M%S', hash_func=sha.sha):
     '''
     Calculate image GID.
@@ -35,8 +37,12 @@ def image_gid(path, format='%(artist)s-%(date)s-%(name)s', date_format='%Y%m%d-%
     Example:
         Alex_A.Naanou-20110627-195706-DSC_1234
+    If hash_func is not None, then the function will be used to generate
+    a hex hash from the above string.
+
     Supported fields:
-        %(artist)s - Exif.Image.Artist field, stripped and spaces replaced with underscores.
+        %(artist)s - Exif.Image.Artist field, stripped and spaces replaced
+                     with underscores.
         %(date)s - Exif.Image.DateTime formatted to date_format argument.
         %(name)s - file name.
@@ -57,9 +63,18 @@ def image_gid(path, format='%(artist)s-%(date)s-%(name)s', date_format='%Y%m%d-%
     if '%(artist)s' in format:
         data['artist'] = i['Exif.Image.Artist'].value.strip().replace(' ', '_')
+    if hash_func is not None:
+        return hash_func(format % data).hexdigest()
     return format % data
+
+
+#-----------------------------------------------------------------------
+if __name__ == '__main__':
+    pass
+
+
 #=======================================================================
 # vim:set ts=4 sw=4 nowrap :
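
A minimal usage sketch of the new hash_func parameter (Python 2, to match the
rest of this repo; the image path and its Exif contents here are made up):

    import sha
    import md5
    from gid import image_gid

    # plain, human-readable GID (no hashing)...
    print image_gid('test/DSC_1234.JPG', hash_func=None)
    # e.g. Alex_A.Naanou-20110627-195706-DSC_1234

    # hex digest of the same string (sha.sha is the new default)...
    print image_gid('test/DSC_1234.JPG')

    # any callable returning an object with a .hexdigest() method works...
    print image_gid('test/DSC_1234.JPG', hash_func=md5.md5)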

index.py (260 lines changed)

@@ -1,7 +1,7 @@
 #=======================================================================
 __version__ = '''0.0.01'''
-__sub_version__ = '''20120202193619'''
+__sub_version__ = '''20120313183420'''
 __copyright__ = '''(c) Alex A. Naanou 2011'''
@@ -11,12 +11,14 @@ import os
 import json
 import zipfile
 import uuid
-from pprint import pprint
 from itertools import izip, izip_longest
 from pli.logictypes import ANY, OR
+from pprint import pprint
+
+import store
 #-----------------------------------------------------------------------
@@ -244,240 +246,6 @@ def split_images(index):
-#-----------------------------------------------------------------------
-# XXX is this a good way to serialize the actual data in the fs???
-# NOTE: these will work with any topology and create a flat index...
-def save_file_index(index, path, index_depth=1, ext='.json'):
-    '''
-    NOTE: index_depth with value greater than 2 is overkill.
-    '''
-    root_index = {}
-    for k, v in index.items():
-        if index_depth > 0:
-            d = []
-            rest = k
-            # build index path...
-            for i in xrange(index_depth):
-                d += [rest[:2]]
-                rest = rest[2:]
-            # recursive directory construction...
-            if not os.path.exists(os.path.join(path, *d)):
-                os.mkdir(os.path.join(path, *d))
-            p = os.path.join(path, *d + [k + ext])
-        else:
-            p = os.path.join(path, k + ext)
-        json.dump(v, file(p, 'w'), indent=4, separators=(', ', ': '))
-        root_index[k] = p
-##        print '.',
-    return root_index
-
-
-def load_file_index(path, ext='.json', pack_ext='.pack'):
-    '''
-    '''
-    d = {}
-    for p, _, files in os.walk(path):
-        for f in files:
-            # handle single files...
-            if f.endswith(ext):
-                d[os.path.splitext(f)[0]] = json.load(file(os.path.join(p, f)))
-            # handle packs...
-            elif f.endswith(pack_ext):
-                pack = zipfile.ZipFile(os.path.join(p, f))
-                # load elements from the pack...
-                for name in pack.namelist():
-                    if name.endswith(ext):
-                        d[os.path.splitext(name)[0]] = json.loads(pack.read(name))
-    return d
-
-
-# XXX should we remove empty dirs here???
-def pack_file_index(path, ext='.json', pack_ext='.pack', keep_files=False, keep_dirs=False):
-    '''
-    NOTE: if keep_files is True, keep_dirs option will be ignored.
-    '''
-    z = zipfile.ZipFile(os.path.join(path, 'index' + pack_ext), 'a', compression=zipfile.ZIP_DEFLATED)
-    for p, _, files in os.walk(path):
-        for f in files:
-            if f.endswith(ext):
-                z.write(os.path.join(p, f), os.path.split(f)[-1])
-                if not keep_files:
-                    os.remove(os.path.join(p, f))
-                    # XXX this will not remove empty dirs (push one
-                    #     level up for that...)
-                    if not keep_dirs and p != path:
-                        ##!!! check if dir is empty....
-                        try:
-                            # NOTE: this will fail for non-empty dirs...
-                            os.rmdir(os.path.join(p))
-                        except:
-                            pass
-    z.close()
-
-
-##!!! get path by name helper...
-##!!!
-
-
-#-----------------------------------------------------------------------
-# lazy dict-like objects that read and (optionally) write the fs...
-
-import pli.pattern.mixin.mapping as mapping
-import pli.objutils as objutils
-
-
-# XXX might be good to do a path index...
-##!!! make this archive/file structure-agnostic...
-class Index(mapping.Mapping):
-    __json_ext__ = '.json'
-    __pack_ext__ = '.pack'
-
-    def __init__(self, path):
-        '''
-        '''
-        self._path = path
-
-    # specific interface...
-    ##!!! make this support different depths...
-    def __locations__(self, name):
-        '''
-        '''
-        ext = self.__json_ext__
-        name += ext
-        # build probable locations...
-        return (
-            name,
-            # look in a directory...
-            os.path.join(name[:2], name),
-            ##!!! HACK: make this dynamic...
-            os.path.join(name[:2], name[2:4], name),
-        )
-
-    # mapping interface...
-    def __getitem__(self, name):
-        '''
-        '''
-##        ext = self.__json_ext__
-        pack_ext = self.__pack_ext__
-##        file_name = name + ext
-        locations = self.__locations__(name)
-        # look for the file directly...
-        for n in locations:
-            if os.path.exists(os.path.join(self._path, n)):
-                return json.load(file(os.path.join(self._path, n)))
-        # try and locate a file in a pack...
-        for p, _, files in os.walk(self._path):
-            # files are searched sorted by their name...
-            files.sort()
-            for f in files:
-##                ##!!! do we need to look in odd named directories...
-##                if f == file_name:
-##                    return json.load(file(os.path.join(p, file_name)))
-                if f.endswith(pack_ext):
-                    z = zipfile.ZipFile(os.path.join(p, f))
-                    for n in locations:
-                        if n in z.namelist():
-                            return json.loads(z.read(n))
-        raise KeyError, name
-
-    def __setitem__(self, name, value):
-        '''
-        '''
-        raise NotImplementedError
-
-    def __delitem__(self, name):
-        '''
-        '''
-        raise NotImplementedError
-
-    def __iter__(self):
-        '''
-        '''
-        visited = []
-        packs = []
-        ext = self.__json_ext__
-        pack_ext = self.__pack_ext__
-        for p, _, files in os.walk(self._path):
-            for f in files:
-                if f.endswith(ext) and f not in visited:
-                    visited += [f]
-                    yield os.path.splitext(f)[0]
-                elif f.endswith(pack_ext):
-                    packs += [os.path.join(p, f)]
-        for pack in packs:
-            z = zipfile.ZipFile(pack)
-            for name in z.namelist():
-                if name not in visited:
-                    visited += [name]
-                    yield os.path.splitext(name)[0]
-
-
-REMOVED = object()
-
-class IndexWithCache(Index):
-    '''
-    '''
-    objutils.createonaccess('_cache', dict)
-
-    __sync__ = False
-
-    def __getitem__(self, name):
-        '''
-        '''
-        if name in self._cache:
-            res = self._cache[name]
-            if res is REMOVED:
-                raise KeyError, name
-            return res
-        res = self._cache[name] = super(IndexWithCache, self).__getitem__(name)
-        return res
-
-    def __setitem__(self, name, value):
-        '''
-        '''
-        self._cache[name] = value
-        if self.__sync__:
-            self.cache_flush(name)
-
-    ##!!!
-    def __delitem__(self, name):
-        '''
-        '''
-        self._cache[name] = REMOVED
-        if self.__sync__:
-            self.cache_flush(name)
-
-    def __iter__(self):
-        '''
-        '''
-        cache = self._cache
-        for e in cache:
-            yield e
-        for e in super(IndexWithCache, self).__iter__():
-            if e not in cache:
-                yield e
-
-    # cache management...
-    ##!!! removed items will not get flushed yet...
-    # XXX to make removing elements history compatible, one way to go
-    #     is to write a specific value to the file, thus making it
-    #     shadow the original value...
-    def cache_flush(self, *keys):
-        '''
-        '''
-        if keys == ():
-            return save_file_index(self._cache, self._path)
-        flush = {}
-        for k in keys:
-            if k is REMOVED:
-                # remove file...
-##                raise NotImplementedError
-                ##!!!
-                continue
-            flush[k] = self[k]
-        return save_file_index(flush, self._path)
-
-    def cache_drop(self):
-        '''
-        '''
-        del self._cache
 #-----------------------------------------------------------------------
 ##!!! test implementation: rewrite...
 import pyexiv2 as metadata
@@ -566,7 +334,7 @@ def build_image_cache(ic, min_rating, dest, tmp_path, preview_size=900):
             continue
     ic.cache_flush()
-    pack_file_index(ic._path, keep_files=False)
+    store.pack_file_index(ic._path, keep_files=False)
     return res
@@ -594,27 +362,27 @@ if __name__ == '__main__':
-    root_index = save_file_index(index, os.path.join('test', 'index'), index_depth=1)
+    root_index = store.save_file_index(index, os.path.join('test', 'index'), index_depth=1)
 ##    ##!!! this is not used in anything yet...
 ##    json.dump(root_index, file(os.path.join('test', 'index', 'file_index.json'), 'w'))
-    pack_file_index(os.path.join('test', 'index'), keep_files=False)
+    store.pack_file_index(os.path.join('test', 'index'), keep_files=False)
-    d = load_file_index(os.path.join('test', 'index'))
+    d = store.load_file_index(os.path.join('test', 'index'))
     print len(d)
     k = d.keys()[0]
-    i = Index(os.path.join('test', 'index'))
+    i = store.Index(os.path.join('test', 'index'))
     print len(i)
 ##    print i[k]
-    ic = IndexWithCache(os.path.join('test', 'index'))
+    ic = store.IndexWithCache(os.path.join('test', 'index'))
     print ic[k]
@@ -622,13 +390,13 @@ if __name__ == '__main__':
     ic.cache_flush()
-    pack_file_index(ic._path, keep_files=False)
+    store.pack_file_index(ic._path, keep_files=False)
     ic.__sync__ = True
     ic['111111111111111111111111111111111'] = {}
-    pack_file_index(ic._path, keep_files=False)
+    store.pack_file_index(ic._path, keep_files=False)
     ##!!! revise...
@@ -647,12 +415,12 @@ if __name__ == '__main__':
     full = dict(json.load(file(os.path.join('test', 'filelist of 20k files.json'))))
     print 'writing files...'
-    root_index = save_file_index(full, os.path.join('test', 'index'), index_depth=1)
+    root_index = store.save_file_index(full, os.path.join('test', 'index'), index_depth=1)
     print 'packing files...'
     # NOTE: the initial archiving seems REALLY SLOW, but working with
     #       small numbers of files from the archive seems adequate...
-    pack_file_index(os.path.join('test', 'index'), keep_files=True)
+    store.pack_file_index(os.path.join('test', 'index'), keep_files=True)
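
For reference, a round-trip sketch of the store API as exercised by the
__main__ section above (the sample key and data are made up; note that
save_file_index shards keys by their leading characters, so with
index_depth=1 the key 'abcdef...' lands in 'ab/abcdef....json', and that it
uses os.mkdir, which is not recursive, so index_depth >= 2 assumes the
parent dirs already exist):

    import os
    import store

    if not os.path.exists(os.path.join('test', 'index')):
        os.makedirs(os.path.join('test', 'index'))

    index = {
        'abcdef0123456789': {'path': 'test/DSC_1234.JPG', 'rating': 5},
    }

    # one .json file per key, sharded into two-char dirs...
    root_index = store.save_file_index(index, os.path.join('test', 'index'), index_depth=1)

    # move the loose .json files into test/index/index.pack (a zip file)...
    store.pack_file_index(os.path.join('test', 'index'), keep_files=False)

    # read everything back, loose files and packs alike...
    d = store.load_file_index(os.path.join('test', 'index'))
    assert d['abcdef0123456789'] == index['abcdef0123456789']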

store.py (new executable file, 250 lines)

@@ -0,0 +1,250 @@
+#=======================================================================
+__version__ = '''0.0.01'''
+__sub_version__ = '''20120313183119'''
+__copyright__ = '''(c) Alex A. Naanou 2011'''
+#-----------------------------------------------------------------------
+
+import os
+import json
+import zipfile
+
+
+#-----------------------------------------------------------------------
+# XXX is this a good way to serialize the actual data in the fs???
+# NOTE: these will work with any topology and create a flat index...
+def save_file_index(index, path, index_depth=1, ext='.json'):
+    '''
+    NOTE: index_depth with value greater than 2 is overkill.
+    '''
+    root_index = {}
+    for k, v in index.items():
+        if index_depth > 0:
+            d = []
+            rest = k
+            # build index path...
+            for i in xrange(index_depth):
+                d += [rest[:2]]
+                rest = rest[2:]
+            # recursive directory construction...
+            if not os.path.exists(os.path.join(path, *d)):
+                os.mkdir(os.path.join(path, *d))
+            p = os.path.join(path, *d + [k + ext])
+        else:
+            p = os.path.join(path, k + ext)
+        json.dump(v, file(p, 'w'), indent=4, separators=(', ', ': '))
+        root_index[k] = p
+##        print '.',
+    return root_index
+
+
+def load_file_index(path, ext='.json', pack_ext='.pack'):
+    '''
+    '''
+    d = {}
+    for p, _, files in os.walk(path):
+        for f in files:
+            # handle single files...
+            if f.endswith(ext):
+                d[os.path.splitext(f)[0]] = json.load(file(os.path.join(p, f)))
+            # handle packs...
+            elif f.endswith(pack_ext):
+                pack = zipfile.ZipFile(os.path.join(p, f))
+                # load elements from the pack...
+                for name in pack.namelist():
+                    if name.endswith(ext):
+                        d[os.path.splitext(name)[0]] = json.loads(pack.read(name))
+    return d
+
+
+# XXX should we remove empty dirs here???
+def pack_file_index(path, ext='.json', pack_ext='.pack', keep_files=False, keep_dirs=False):
+    '''
+    NOTE: if keep_files is True, keep_dirs option will be ignored.
+    '''
+    z = zipfile.ZipFile(os.path.join(path, 'index' + pack_ext), 'a', compression=zipfile.ZIP_DEFLATED)
+    for p, _, files in os.walk(path):
+        for f in files:
+            if f.endswith(ext):
+                z.write(os.path.join(p, f), os.path.split(f)[-1])
+                if not keep_files:
+                    os.remove(os.path.join(p, f))
+                    # XXX this will not remove empty dirs (push one
+                    #     level up for that...)
+                    if not keep_dirs and p != path:
+                        ##!!! check if dir is empty....
+                        try:
+                            # NOTE: this will fail for non-empty dirs...
+                            os.rmdir(os.path.join(p))
+                        except:
+                            pass
+    z.close()
+
+
+##!!! get path by name helper...
+##!!!
+
+
+#-----------------------------------------------------------------------
+# lazy dict-like objects that read and (optionally) write the fs...
+
+import pli.pattern.mixin.mapping as mapping
+import pli.objutils as objutils
+
+
+# XXX might be good to do a path index...
+##!!! make this archive/file structure-agnostic...
+class Index(mapping.Mapping):
+    __json_ext__ = '.json'
+    __pack_ext__ = '.pack'
+
+    def __init__(self, path):
+        '''
+        '''
+        self._path = path
+
+    # specific interface...
+    ##!!! make this support different depths...
+    def __locations__(self, name):
+        '''
+        '''
+        ext = self.__json_ext__
+        name += ext
+        # build probable locations...
+        return (
+            name,
+            # look in a directory...
+            os.path.join(name[:2], name),
+            ##!!! HACK: make this dynamic...
+            os.path.join(name[:2], name[2:4], name),
+        )
+
+    # mapping interface...
+    def __getitem__(self, name):
+        '''
+        '''
+##        ext = self.__json_ext__
+        pack_ext = self.__pack_ext__
+##        file_name = name + ext
+        locations = self.__locations__(name)
+        # look for the file directly...
+        for n in locations:
+            if os.path.exists(os.path.join(self._path, n)):
+                return json.load(file(os.path.join(self._path, n)))
+        # try and locate a file in a pack...
+        for p, _, files in os.walk(self._path):
+            # files are searched sorted by their name...
+            files.sort()
+            for f in files:
+##                ##!!! do we need to look in odd named directories...
+##                if f == file_name:
+##                    return json.load(file(os.path.join(p, file_name)))
+                if f.endswith(pack_ext):
+                    z = zipfile.ZipFile(os.path.join(p, f))
+                    for n in locations:
+                        if n in z.namelist():
+                            return json.loads(z.read(n))
+        raise KeyError, name
+
+    def __setitem__(self, name, value):
+        '''
+        '''
+        raise NotImplementedError
+
+    def __delitem__(self, name):
+        '''
+        '''
+        raise NotImplementedError
+
+    def __iter__(self):
+        '''
+        '''
+        visited = []
+        packs = []
+        ext = self.__json_ext__
+        pack_ext = self.__pack_ext__
+        for p, _, files in os.walk(self._path):
+            for f in files:
+                if f.endswith(ext) and f not in visited:
+                    visited += [f]
+                    yield os.path.splitext(f)[0]
+                elif f.endswith(pack_ext):
+                    packs += [os.path.join(p, f)]
+        for pack in packs:
+            z = zipfile.ZipFile(pack)
+            for name in z.namelist():
+                if name not in visited:
+                    visited += [name]
+                    yield os.path.splitext(name)[0]
+
+
+REMOVED = object()
+
+class IndexWithCache(Index):
+    '''
+    '''
+    objutils.createonaccess('_cache', dict)
+
+    __sync__ = False
+
+    def __getitem__(self, name):
+        '''
+        '''
+        if name in self._cache:
+            res = self._cache[name]
+            if res is REMOVED:
+                raise KeyError, name
+            return res
+        res = self._cache[name] = super(IndexWithCache, self).__getitem__(name)
+        return res
+
+    def __setitem__(self, name, value):
+        '''
+        '''
+        self._cache[name] = value
+        if self.__sync__:
+            self.cache_flush(name)
+
+    ##!!!
+    def __delitem__(self, name):
+        '''
+        '''
+        self._cache[name] = REMOVED
+        if self.__sync__:
+            self.cache_flush(name)
+
+    def __iter__(self):
+        '''
+        '''
+        cache = self._cache
+        for e in cache:
+            yield e
+        for e in super(IndexWithCache, self).__iter__():
+            if e not in cache:
+                yield e
+
+    # cache management...
+    ##!!! removed items will not get flushed yet...
+    # XXX to make removing elements history compatible, one way to go
+    #     is to write a specific value to the file, thus making it
+    #     shadow the original value...
+    def cache_flush(self, *keys):
+        '''
+        '''
+        if keys == ():
+            return save_file_index(self._cache, self._path)
+        flush = {}
+        for k in keys:
+            if k is REMOVED:
+                # remove file...
+##                raise NotImplementedError
+                ##!!!
+                continue
+            flush[k] = self[k]
+        return save_file_index(flush, self._path)
+
+    def cache_drop(self):
+        '''
+        '''
+        del self._cache
+
+
+#=======================================================================
+# vim:set ts=4 sw=4 nowrap :
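
And a sketch of the lazy mapping classes over the same layout (keys again
made up; Index is read-only, its __setitem__/__delitem__ raise
NotImplementedError, while IndexWithCache buffers writes and only touches
the fs on cache_flush(), or on every write once __sync__ is set):

    import os
    import store

    # lazy lookup: tries <key>.json directly, then the sharded dirs,
    # then inside any *.pack archives...
    i = store.Index(os.path.join('test', 'index'))
    print len(i)
    print i['abcdef0123456789']

    # buffered writes...
    ic = store.IndexWithCache(os.path.join('test', 'index'))
    ic['ffffffffffffffff'] = {'rating': 3}
    ic.cache_flush()                # writes the whole cache out as .json

    # write-through mode...
    ic.__sync__ = True
    ic['eeeeeeeeeeeeeeee'] = {'rating': 1}

    # and pack the flushed files back into the archive...
    store.pack_file_index(ic._path, keep_files=False)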