From Wikipedia:

If the symbols are sorted by probability, there is a linear-time (\(O(n)\)) method to create a Huffman tree using two queues, the first one containing the initial weights (along with pointers to the associated leaves), and combined weights (along with pointers to the trees) being put in the back of the second queue. This assures that the lowest weight is always kept at the front of one of the two queues:

  1. Start with as many leaves as there are symbols.

  2. Enqueue all leaf nodes into the first queue (by probability in increasing order so that the least likely item is in the head of the queue).

  3. While there is more than one node in the queues:

    1. Dequeue the two nodes with the lowest weight by examining the fronts of both queues.

    2. Create a new internal node, with the two just-removed nodes as children (either node can be either child) and the sum of their weights as the new weight.

    3. Enqueue the new node into the rear of the second queue.

  4. The remaining node is the root node; the tree has now been generated.

Although linear-time given sorted input, in the general case of arbitrary input, using this algorithm requires pre-sorting. Thus, since sorting takes \(O(n \log n)\) time in the general case, both methods have the same overall complexity.

Code

#!/usr/bin/python

'''Construct Huffman codes from input text.'''

import sys

class Node:
    def __init__(self, name, val):
        self.parent = None
        self.left = None
        self.right = None
        self.name = name
        self.val = val

def traverse(root, s, codes):
    if root.left is None and root.right is None:
        codes[root.name] = s
        return

    traverse(root.left, s + '0', codes)
    traverse(root.right, s + '1', codes)

def huffman(d):
    nodes = []
    for k in sorted(d.keys(), key=lambda x: d[x]):
        nodes.append(Node(k, d[k]))
    n = len(nodes)

    merges = []
    for i in range(n - 1):
        weights = []
        if len(nodes) >= 2:
            w20 = nodes[0].val + nodes[1].val
            weights.append(w20)
        if len(nodes) >= 1 and len(merges) >= 1:
            w11 = nodes[0].val + merges[0].val
            weights.append(w11)
        if len(merges) >= 2:
            w02 = merges[0].val + merges[1].val
            weights.append(w02)
        wmin = min(weights)
        if len(nodes) >= 2 and wmin == w20:
            node = Node(nodes[0].name + ',' + nodes[1].name, wmin)
            node.left, node.right = nodes[0], nodes[1]
            nodes[0].parent = nodes[1].parent = node
            nodes.pop(0)
            nodes.pop(0)
            merges.append(node)
        elif len(nodes) >= 1 and len(merges) >= 1 and wmin == w11:
            node = Node(nodes[0].name + ',' + merges[0].name, wmin)
            node.left, node.right = nodes[0], merges[0]
            nodes[0].parent = merges[0].parent = node
            nodes.pop(0)
            merges.pop(0)
            merges.append(node)
        elif len(merges) >= 2 and wmin == w02:
            node = Node(merges[0].name + ',' + merges[1].name, wmin)
            node.left, node.right = merges[0], merges[1]
            merges[0].parent = merges[1].parent = node
            merges.pop(0)
            merges.pop(0)
            merges.append(node)

    root = merges[0]
    codes = {}
    traverse(root, '', codes)
    return codes

def main():
    text = sys.stdin.read()
    chars = {}
    for c in text:
        if c in chars:
            chars[c] += 1
        else:
            chars[c] = 1

    for k, v in sorted(huffman(chars).items(), key=lambda e: len(e[1])):
        print('{:4}: {}'.format(repr(k), v))

if __name__ == '__main__':
    main()