我的尝试:
def merge(lsts): sets = [set(lst) for lst in lsts if lst] merged = True while merged: merged = False results = [] while sets: common, rest = sets[0], sets[1:] sets = [] for x in rest: if x.isdisjoint(common): sets.append(x) else: merged = True common |= x results.append(common) sets = results return setslst = [[65, 17, 5, 30, 79, 56, 48, 62], [6, 97, 32, 93, 55, 14, 70, 32], [75, 37, 83, 34, 9, 19, 14, 64], [43, 71], [], [89, 49, 1, 30, 28, 3, 63], [35, 21, 68, 94, 57, 94, 9, 3], [16], [29, 9, 97, 43], [17, 63, 24]]print merge(lst)
基准测试:
import random# adapt parameters to your own usage scenarioclass_count = 50class_size = 1000list_count_per_class = 100large_list_sizes = list(range(100, 1000))small_list_sizes = list(range(0, 100))large_list_probability = 0.5if False: # change to true to generate the test data file (takes a while) with open("/tmp/test.txt", "w") as f: lists = [] classes = [ range(class_size * i, class_size * (i + 1)) for i in range(class_count) ] for c in classes: # distribute each class across ~300 lists for i in xrange(list_count_per_class): lst = [] if random.random() < large_list_probability: size = random.choice(large_list_sizes) else: size = random.choice(small_list_sizes) nums = set(c) for j in xrange(size): x = random.choice(list(nums)) lst.append(x) nums.remove(x) random.shuffle(lst) lists.append(lst) random.shuffle(lists) for lst in lists: f.write(" ".join(str(x) for x in lst) + "n")setup = """# Niklas'def merge_niklas(lsts): sets = [set(lst) for lst in lsts if lst] merged = 1 while merged: merged = 0 results = [] while sets: common, rest = sets[0], sets[1:] sets = [] for x in rest: if x.isdisjoint(common): sets.append(x) else: merged = 1 common |= x results.append(common) sets = results return sets# Rik'sdef merge_rik(data): sets = (set(e) for e in data if e) results = [next(sets)] for e_set in sets: to_update = [] for i, res in enumerate(results): if not e_set.isdisjoint(res): to_update.insert(0, i) if not to_update: results.append(e_set) else: last = results[to_update.pop(-1)] for i in to_update: last |= results[i] del results[i] last |= e_set return results# katrielalex'sdef pairs(lst): i = iter(lst) first = prev = item = i.next() for item in i: yield prev, item prev = item yield item, firstimport networkxdef merge_katrielalex(lsts): g = networkx.Graph() for lst in lsts: for edge in pairs(lst): g.add_edge(*edge) return networkx.connected_components(g)# agf's (optimized)from collections import dequedef merge_agf_optimized(lists): sets = deque(set(lst) for lst in lists if lst) results = [] disjoint = 0 current = sets.pop() while True: merged = False newsets = deque() for _ in xrange(disjoint, len(sets)): this = sets.pop() if not current.isdisjoint(this): current.update(this) merged = True disjoint = 0 else: newsets.append(this) disjoint += 1 if sets: newsets.extendleft(sets) if not merged: results.append(current) try: current = newsets.pop() except IndexError: break disjoint = 0 sets = newsets return results# agf's (simple)def merge_agf_simple(lists): newsets, sets = [set(lst) for lst in lists if lst], [] while len(sets) != len(newsets): sets, newsets = newsets, [] for aset in sets: for eachset in newsets: if not aset.isdisjoint(eachset): eachset.update(aset) break else: newsets.append(aset) return newsets# alexis'def merge_alexis(data): bins = range(len(data)) # Initialize each bin[n] == n nums = dict() data = [set(m) for m in data] # Convert to sets for r, row in enumerate(data): for num in row: if num not in nums: # New number: tag it with a pointer to this row's bin nums[num] = r continue else: dest = locatebin(bins, nums[num]) if dest == r: continue # already in the same bin if dest > r: dest, r = r, dest # always merge into the smallest bin data[dest].update(data[r]) data[r] = None # Update our indices to reflect the move bins[r] = dest r = dest # Filter out the empty bins have = [m for m in data if m] return havedef locatebin(bins, n): while bins[n] != n: n = bins[n] return nlsts = []size = 0num = 0max = 0for line in open("/tmp/test.txt", "r"): lst = [int(x) for x in line.split()] size += len(lst) if len(lst) > max: max = len(lst) num += 1 lsts.append(lst)"""setup += """print "%i lists, {class_count} equally distributed classes, average size %i, max size %i" % (num, size/num, max)""".format(class_count=class_count)import timeitprint "niklas"print timeit.timeit("merge_niklas(lsts)", setup=setup, number=3)print "rik"print timeit.timeit("merge_rik(lsts)", setup=setup, number=3)print "katrielalex"print timeit.timeit("merge_katrielalex(lsts)", setup=setup, number=3)print "agf (1)"print timeit.timeit("merge_agf_optimized(lsts)", setup=setup, number=3)print "agf (2)"print timeit.timeit("merge_agf_simple(lsts)", setup=setup, number=3)print "alexis"print timeit.timeit("merge_alexis(lsts)", setup=setup, number=3)
这些时间显然取决于基准测试的特定参数,例如类数,列表数,列表大小等。请根据需要调整这些参数以获得更有用的结果。
以下是我的机器上针对不同参数的一些示例输出。他们表明,所有算法都有其优势和劣势,具体取决于所获得的输入类型:
=====================# many disjoint classes, large listsclass_count = 50class_size = 1000list_count_per_class = 100large_list_sizes = list(range(100, 1000))small_list_sizes = list(range(0, 100))large_list_probability = 0.5=====================niklas5000 lists, 50 equally distributed classes, average size 298, max size 9994.80084705353rik5000 lists, 50 equally distributed classes, average size 298, max size 9999.49251699448katrielalex5000 lists, 50 equally distributed classes, average size 298, max size 99921.5317108631agf (1)5000 lists, 50 equally distributed classes, average size 298, max size 9998.61671280861agf (2)5000 lists, 50 equally distributed classes, average size 298, max size 9995.18117713928=> alexis=> 5000 lists, 50 equally distributed classes, average size 298, max size 999=> 3.73504281044===================# less number of classes, large listsclass_count = 15class_size = 1000list_count_per_class = 300large_list_sizes = list(range(100, 1000))small_list_sizes = list(range(0, 100))large_list_probability = 0.5===================niklas4500 lists, 15 equally distributed classes, average size 296, max size 9991.79993700981rik4500 lists, 15 equally distributed classes, average size 296, max size 9992.58237695694katrielalex4500 lists, 15 equally distributed classes, average size 296, max size 99919.5465381145agf (1)4500 lists, 15 equally distributed classes, average size 296, max size 9992.75445604324=> agf (2)=> 4500 lists, 15 equally distributed classes, average size 296, max size 999=> 1.77850699425alexis4500 lists, 15 equally distributed classes, average size 296, max size 9993.23530197144===================# less number of classes, smaller listsclass_count = 15class_size = 1000list_count_per_class = 300large_list_sizes = list(range(100, 1000))small_list_sizes = list(range(0, 100))large_list_probability = 0.1===================niklas4500 lists, 15 equally distributed classes, average size 95, max size 9970.773697137833rik4500 lists, 15 equally distributed classes, average size 95, max size 9971.0523750782katrielalex4500 lists, 15 equally distributed classes, average size 95, max size 9976.04466891289agf (1)4500 lists, 15 equally distributed classes, average size 95, max size 9971.20285701752=> agf (2)=> 4500 lists, 15 equally distributed classes, average size 95, max size 997=> 0.714507102966alexis4500 lists, 15 equally distributed classes, average size 95, max size 9971.1286110878
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)