# http://eggdrop.ch/blog/2008/02/17/compare-directories/

import hashlib
import md5
import os
import sys

IGNORE_FILES = ['.DS_Store']
IGNORE_HIDDEN = True

# Parse the command line: an optional leading '-d' flag followed by the two
# directories to compare. Any further arguments are ignored.
args = sys.argv[1:]
debug = args[:1] == ['-d']
if debug:
    args = args[1:]

# Validate the count *after* stripping the flag, so that 'prog -d dir1'
# shows the usage text instead of dying with an IndexError.
if len(args) < 2:
    # Single-argument print(...) is valid under both Python 2 and Python 3.
    print('Usage: %s [-d] dir1 dir2' % sys.argv[0])
    print('Compares files in two directories, based on their MD5 checksum.')
    print('-d: Debug. Prints the MD5 checksum of every file to stderr.')
    sys.exit(1)

dir1, dir2 = args[:2]


# From http://mail.python.org/pipermail/python-list/2005-February/306758.html
def md5file(filename, blocksize=4096):
    """Return the hex MD5 digest of a file without loading it all into memory.

    Args:
        filename: Path of the file to hash.
        blocksize: Number of bytes to read per iteration.

    Returns:
        The digest as a 32-character lowercase hexadecimal string.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    # Binary mode: the checksum must cover the raw bytes. Text mode may
    # translate line endings (or stop early at an EOF marker) on some
    # platforms, which would give wrong digests for binary files.
    with open(filename, 'rb') as fh:
        # read() returns an empty byte string at end of file.
        for buf in iter(lambda: fh.read(blocksize), b''):
            digest.update(buf)
    return digest.hexdigest()


def scandir(dir):
    """Walk *dir* recursively and checksum every file that is not ignored.

    Files whose name is listed in IGNORE_FILES are skipped, as are files
    whose name starts with '.' when IGNORE_HIDDEN is true. Only file names
    are filtered; hidden directories are still descended into.

    Side effects:
        * Whenever a file has the same checksum as a file seen earlier in
          this scan, both paths are printed to stdout followed by a blank
          line.
        * When the module-level ``debug`` flag is set, every checksum and
          path is written to stderr.
        * A file that cannot be read is reported on stderr and skipped.

    Args:
        dir: Root directory of the tree to scan.

    Returns:
        A tuple ``(d, checksums)``: ``d`` maps each checksum to the path of
        the last file seen with that checksum; ``checksums`` lists the
        checksum of every scanned file in scan order, duplicates included.
    """
    d = {}
    checksums = []
    for root, dirs, files in os.walk(dir):
        # Sorting in place makes os.walk descend in a stable order; together
        # with sorted(files) this keeps the output, and the choice of which
        # duplicate ends up in ``d``, reproducible between runs.
        dirs.sort()
        for fname in sorted(files):
            if fname in IGNORE_FILES:
                continue
            if IGNORE_HIDDEN and fname.startswith('.'):
                continue
            name = os.path.join(root, fname)
            try:
                digest = md5file(name)
            except (IOError, OSError) as err:
                # One unreadable file (broken symlink, permissions) should
                # not abort the scan of the whole tree.
                sys.stderr.write('Skipping unreadable file %s: %s\n'
                                 % (name, err))
                continue
            if debug:
                sys.stderr.write('%s %s\n' % (digest, name))
            checksums.append(digest)
            # Print duplicate files
            if digest in d:
                print(d[digest])
                print(name)
                print('')
            d[digest] = name
    return d, checksums


def printfiles(checksums, d1, d2=None):
    """Print the file paths recorded for each of the given checksums.

    Args:
        checksums: Iterable of checksums; output follows its iteration
            order.
        d1: Mapping from checksum to file path.
        d2: Optional second mapping from checksum to file path. When given
            (even if empty), each path from ``d1`` is followed by the
            matching path from ``d2`` and a blank line.

    Raises:
        KeyError: If a checksum is missing from a mapping that is consulted.
    """
    for c in checksums:
        print(d1[c])
        # Identity test: only the None default means "no second mapping".
        if d2 is not None:
            print(d2[c])
            print('')


# The "Duplicate files" header is printed before scanning because scandir()
# reports duplicates on stdout as it encounters them.
print('Duplicate files')
print('=' * 60)
print('')

d1, c1 = scandir(dir1)
d2, c2 = scandir(dir2)

# Build each checksum set once instead of once per report section.
sums1 = set(c1)
sums2 = set(c2)

# Each section is sorted by file path so the report is reproducible rather
# than following arbitrary set iteration order.
print('Common files')
print('=' * 60)
print('')
printfiles(sorted(sums1 & sums2, key=d1.get), d1, d2)
print('Files only in %s' % dir1)
print('=' * 60)
print('')
printfiles(sorted(sums1 - sums2, key=d1.get), d1)
print('')
print('Files only in %s' % dir2)
print('=' * 60)
print('')
printfiles(sorted(sums2 - sums1, key=d2.get), d2)
