Huffman tree linear-time construction
From Wikipedia:
If the symbols are sorted by probability, there is a linear-time (\(O(n)\)) method to create a Huffman tree using two queues, the first one containing the initial weights (along with pointers to the associated leaves), and combined weights (along with pointers to the trees) being put in the back of the second queue. This assures that the lowest weight is always kept at the front of one of the two queues:
Start with as many leaves as there are symbols.
Enqueue all leaf nodes into the first queue (by probability in increasing order so that the least likely item is in the head of the queue).
While there is more than one node in the queues:
Dequeue the two nodes with the lowest weight by examining the fronts of both queues.
Create a new internal node, with the two just-removed nodes as children (either node can be either child) and the sum of their weights as the new weight.
Enqueue the new node into the rear of the second queue.
The remaining node is the root node; the tree has now been generated.
Although linear-time given sorted input, in the general case of arbitrary input, using this algorithm requires pre-sorting. Thus, since sorting takes \(O(n \log n)\) time in the general case, both methods have the same overall complexity.
Code
#!/usr/bin/python
'''Construct Huffman codes from input text.'''
import sys
class Node:
def __init__(self, name, val):
self.parent = None
self.left = None
self.right = None
self.name = name
self.val = val
def traverse(root, s, codes):
if root.left is None and root.right is None:
codes[root.name] = s
return
traverse(root.left, s + '0', codes)
traverse(root.right, s + '1', codes)
def huffman(d):
nodes = []
for k in sorted(d.keys(), key=lambda x: d[x]):
nodes.append(Node(k, d[k]))
n = len(nodes)
merges = []
for i in range(n - 1):
weights = []
if len(nodes) >= 2:
w20 = nodes[0].val + nodes[1].val
weights.append(w20)
if len(nodes) >= 1 and len(merges) >= 1:
w11 = nodes[0].val + merges[0].val
weights.append(w11)
if len(merges) >= 2:
w02 = merges[0].val + merges[1].val
weights.append(w02)
wmin = min(weights)
if len(nodes) >= 2 and wmin == w20:
node = Node(nodes[0].name + ',' + nodes[1].name, wmin)
node.left, node.right = nodes[0], nodes[1]
nodes[0].parent = nodes[1].parent = node
nodes.pop(0)
nodes.pop(0)
merges.append(node)
elif len(nodes) >= 1 and len(merges) >= 1 and wmin == w11:
node = Node(nodes[0].name + ',' + merges[0].name, wmin)
node.left, node.right = nodes[0], merges[0]
nodes[0].parent = merges[0].parent = node
nodes.pop(0)
merges.pop(0)
merges.append(node)
elif len(merges) >= 2 and wmin == w02:
node = Node(merges[0].name + ',' + merges[1].name, wmin)
node.left, node.right = merges[0], merges[1]
merges[0].parent = merges[1].parent = node
merges.pop(0)
merges.pop(0)
merges.append(node)
root = merges[0]
codes = {}
traverse(root, '', codes)
return codes
def main():
text = sys.stdin.read()
chars = {}
for c in text:
if c in chars:
chars[c] += 1
else:
chars[c] = 1
for k, v in sorted(huffman(chars).items(), key=lambda e: len(e[1])):
print('{:4}: {}'.format(repr(k), v))
if __name__ == '__main__':
main()