import sys

# A simple reducer for word counting.
# Note: you wouldn't actually want to do this for a potentially unbounded
# number of tokens, because every distinct token is kept in an in-memory dict
# until all of the input has been read.


def _count_tokens(lines):
    """Sum the counts for each token found in *lines*.

    Each line is expected to look like ``token<TAB>count``. Only the last
    tab separates the token from its count, so a token may itself contain
    tabs. Blank (whitespace-only) lines are skipped.

    Args:
        lines: Iterable of text lines, with or without trailing newlines.

    Returns:
        A dict mapping each token to the integer sum of its counts.

    Raises:
        ValueError: If a non-blank line has no tab, or its count field is
            not an integer.
    """
    counts = {}
    for line in lines:
        if not line.strip():
            # A stray empty line (e.g. at the end of the input) is not data.
            continue
        # int() tolerates the trailing newline left on the count field.
        token, count = line.rsplit('\t', 1)
        counts[token] = counts.get(token, 0) + int(count)
    return counts


def main(stdin=None, stdout=None):
    """Read ``token<TAB>count`` lines and write the total count per token.

    Output lines have the form ``token<TAB>total``. They are written in
    decreasing frequency order (most frequent tokens first), and
    alphabetically among tokens that occur the same number of times.

    Args:
        stdin: Iterable of input lines; defaults to ``sys.stdin``.
        stdout: Object with a ``write`` method; defaults to ``sys.stdout``.

    Raises:
        ValueError: If a non-blank input line is malformed.
    """
    if stdin is None:
        stdin = sys.stdin
    if stdout is None:
        stdout = sys.stdout

    counts = _count_tokens(stdin)
    # Negating the count sorts by frequency descending while the token itself
    # breaks ties in ascending (alphabetical) order.
    ordered = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    for token, count in ordered:
        # write() instead of the print statement so the script runs unchanged
        # on both Python 2 and Python 3.
        stdout.write('%s\t%s\n' % (token, count))


if __name__ == '__main__':
    main()