Also added some extra file detection. If a file is zero bytes in size, it is no longer added to the hash list, but it is still recorded in the output file. The same goes for files bigger than the maximum size.
#! /usr/bin/env python
''' A python program that walks a given directory to find files that are
duplicated. It then outputs the results to console (simply printing a
dictionary), and an output file.
command line parameters
./directoryhash_1.4.py [root directory] [outputfile] [max file size in bytes]
'''
import os
import sys
import md5
hashes = {}     # The "working" dictionary: md5 hex digest -> list of file paths.
final = {}      # The final dictionary with all the duplicated files,
                # with their hashes as keys.
zerobytes = []  # A list of files with zero bytes.
toobig = []     # Files that were too big.

# Read and validate every command-line argument before touching the output
# file, so a missing or non-numeric size does not leave behind an empty
# (or freshly truncated) output file.
rootpath = sys.argv[1]
outputpath = sys.argv[2]
# int() accepts arbitrarily large values, so no separate long() is needed.
maxfile = int(sys.argv[3])
outputfile = open(outputpath, "w")
def hashfunction(filetohash):
    ''' Computes the md5 checksum of filetohash and records the path in the
    module-level hashes dictionary, keyed by the hex digest. Paths that share
    a digest end up in the same list, in the order they were hashed.

    filetohash -- path of the file to read.

    Returns None. If the file cannot be opened or read (IOError), a short
    notice is printed and hashes is left unchanged.
    '''
    # Imported here so the function does not depend on the deprecated md5
    # module imported at the top of the file.
    import hashlib

    digest = hashlib.md5()
    try:
        # "with" closes the handle even when a read fails part-way through.
        with open(filetohash, "rb") as openedfile:
            # Feed the file in 1 MiB pieces so a large file is never held
            # in memory all at once; read() returns b"" at end of file.
            for chunk in iter(lambda: openedfile.read(1 << 20), b""):
                digest.update(chunk)
    except IOError:
        print("\n")
        print(filetohash)
        print("Probably a directory. Ignoring")
        return
    hashes.setdefault(digest.hexdigest(), []).append(filetohash)
# The following section walks the directory tree from rootpath and sorts
# every file into one of three buckets: too big (recorded in toobig), empty
# (recorded in zerobytes), or hashable (passed to hashfunction()).
for dirpath, directories, files in os.walk(rootpath):
    for filename in files:
        filepath = os.path.join(dirpath, filename)
        print(filepath)
        try:
            # Stat once so all the size tests below see the same value.
            size = os.path.getsize(filepath)
        except OSError:
            # Handles errors with the filenames, usually seems to be because
            # of file locking etc.
            print("BORK!")
            continue
        if size > maxfile:
            print(filepath + "\n" + "Too big!")
            toobig.append(filepath)
        elif size > 0:
            # Includes files of exactly maxfile bytes, which previously
            # matched no branch and were silently left out of every list.
            hashfunction(filepath)
        else:
            zerobytes.append(filepath)
# Copy into final every digest that is shared by two or more paths; a digest
# with a single path belongs to a unique file and is left out.
final.update(
    (digest, paths) for digest, paths in hashes.items() if len(paths) >= 2
)
print("\n")
# Write the report to the output file: one section per duplicated digest (or
# a note that there were none), then the empty files and the oversized files,
# each section only when its list is non-empty. The console only gets a
# one-line summary.
separator = "-" * 20
if final:
    print("Duplicates found \nCheck output file \n" + separator)
    for digest, paths in final.items():
        outputfile.write("hash: " + digest + "\n")
        outputfile.writelines(path + "\n" for path in paths)
        outputfile.write(separator + "\n\n")
else:
    print("No duplicates found! \n" + separator)
    outputfile.write("No Duplicates found!\n\n")

# The two trailing sections share one layout: heading, rule, paths, rule.
for heading, paths in (
    ("Empty files \n", zerobytes),
    ("Files bigger than " + str(maxfile) + " bytes" + "\n", toobig),
):
    if paths:
        outputfile.write(heading + separator + "\n")
        outputfile.writelines(path + "\n" for path in paths)
        outputfile.write(separator + "\n\n")
outputfile.close()
Enjoy!
[ add comment ] | permalink

Calendar



