- implemented a basic path-based grouping strategy for files with identical names; not all corner cases are covered yet

- need to fix tests: RAWs from different locations are now grouped via GID (the behaviour is correct, but the test is now wrong)
- need to reorganize the code (index2.py)

Signed-off-by: Alex A. Naanou <alex.nanou@gmail.com>
2025-10-29 10:20:08 +00:00 · 2012-03-03 02:11:19 +04:00 · 2012-03-03 02:11:19 +04:00 · c3db4c5724
commit c3db4c5724
parent 75b5629a11
1 changed files with 76 additions and 16 deletions
--- a/index2.py
+++ b/index2.py
@ -1,7 +1,7 @@
 #=======================================================================

 __version__ = '''0.0.01'''
-__sub_version__ = '''20120302161602'''
+__sub_version__ = '''20120303020603'''
 __copyright__ = '''(c) Alex A. Naanou 2011'''


@ -37,6 +37,7 @@ CONFIG_NAME = 'test_config.json'

 config = json.load(open(CONFIG_NAME))

+# XXX move this to a context-dependant module...
 RAW = OR(
 	'NEF', 'nef', 
 	'CRW', 'crw',
@ -111,7 +112,7 @@ def image_gid(path, format='%(artist)s-%(date)s-%(name)s', date_format='%Y%m%d-%
 	NOTE: need EXIF data to generate a GID
 	'''
 	data = {
-		'name': os.path.splitext(os.path.split(path)[-1])[0]
+		'name': os.path.splitext(os.path.split(path)[-1])[0],
 	}
 	# check if we need a date in the id...
 	if '%(date)s' in format:
@ -153,6 +154,23 @@ def list_files(root, sub_trees=SUBTREE_CLASSES, type=ITEM, include_root_path=Fal
 					yield path, name, ext


+def common_len(a, *b):
+	'''
+	'''
+	for i, l in enumerate(izip(*(a,) + b)):
+		if len(set(l)) != 1:
+			return i
+	return len(min(*(a,) + b))
+
+
+##!!! is this meaningless?
+def path_distance(a, b):
+	'''
+	'''
+	return len(a) + len(b) - common_len(a, b)*2
+
+
+


 #-----------------------------------------------------------------------
@ -194,22 +212,58 @@ if __name__ == '__main__':
 	# index via a propper GID...
 	# split similarly named but different files...
 	GID_index = {}
+	failed = []
 	for name, l in index.items():

 		l.sort()

 		raws = [e for e in l if e[2] == RAW] 

-		for raw in raws:
-			if len(raws) > 1:
-				print 'duplicates: %s (%sx)...' % (name, len(raws)),
-				# split the group into c seporate groups...
-				# strategies:
-				# 	- path proximity (distance)
-				# 	- metadata
-				##!!!
-				print 'skipping.'
-				break
+		# handle multiple raw files...
+		if len(raws) > 1:
+			common = common_len(*[ e[0] for e in raws ])
+
+			# NOTE: do not change the order of raws after this point
+			# 		and till the end of the loop...
+			# 		XXX revise if there is a simpler way...
+			sets = [ (r, [r]) for r in raws ]
+
+			for e in l:
+				if e[2] == RAW:
+					continue
+				# check if we are closer to other raws...
+				# NOTE: this depends on stability of order in raws
+				c_index = [(common_len(r[0], e[0]), r, i) for i, r in enumerate(raws)]
+				c, raw, i = max(*c_index)
+				if c_index.count([c, ANY, ANY]) > 1:
+					# a file is at a path junction exactly...
+					print '    !!! can\'t decide where to put %s.%s...' % (e[1], e[2])
+					##!!! try different strategies here...
+					##!!!
+					failed += [e]
+				elif c > common:
+					# found a propper location...
+					s = sets[i][1]
+					s += [e]
+					##!!! for some reason this does not work....
+##					sets[i][1] += [e]
+				else:
+					print '    !!! can\'t decide where to put %s.%s...' % (e[1], e[2])
+					##!!! try different strategies here...
+					##!!!
+					failed += [e]
+		# single raw...
+		elif len(raws) == 1:
+			sets = [(raws[0], l)]
+		# no raw files...
+		else:
+			print 'no raw file found for "%s"...' % os.path.join(name)
+			sets = []
+			##!!! need to report this in a usable way...
+			failed += l
+
+
+		for raw, l in sets:
 			# get file GID...
 			GID = image_gid('%s.%s' % (os.path.join(*[config['ARCHIVE_ROOT']] + raw[0] + [raw[1]]), raw[2]))

@ -217,7 +271,9 @@ if __name__ == '__main__':
 				'gid': GID,
 				'name': name,
 				'imported': time.time(),
-				# NOTE: this might get distorted on archiving...
+				# NOTE: this might get distorted on archiving or
+				# 		copying...
+				# 		mostly intended for importing...
 				'ctime': raw[3], 
 				'RAW': raws,
 				'XMP': [e for e in l if e[2] == XMP],
@ -235,9 +291,13 @@ if __name__ == '__main__':
 	# 			- find new subtrees
 	# 			- find modified items (file date diff)
 	
-
-	print GID
-	print len(GID_index), len([ e for e in lst if e[2] == RAW])
+	# NOTE: raws number here may be more than indexed because some raws 
+	# 		may get grouped by GID
+	print '''results:
+	indexed: %s
+	raws: %s
+	failed: %s
+	''' % (len(GID_index), len([ e for e in lst if e[2] == RAW]), len(failed))

 	pprint(GID_index.values()[0])