PYTHON SCRIPT FOR FINDING AND REMOVING DUPLICATE FILES


20/05/2012 by electrocoder

 

<pre class="prettyprint">

import sys, os, cPickle, hashlib, pprint
from datetime import datetime
from collections import defaultdict

class FileInfo(object):
    cache_file = "c:\\temp\\finddupes.cache"
    sizes = defaultdict(list)          # file size -> list of paths with that size
    fingerprints = defaultdict(list)   # tail hash -> list of paths with that hash

    def __init__(self):
        #self.load_cache()
        pass

    def __del__(self):
        #self.save_cache()
        pass

    def load_cache(self):
        sys.stderr.write("Loading Cache\n")
        try:
            cache = file(self.cache_file, "rb")
            self.sizes = cPickle.load(cache)
            self.fingerprints = cPickle.load(cache)
        except Exception, e:
            print "Couldn't read cache file: " + str(e)

    def save_cache(self):
        sys.stderr.write("Saving Cache\n")
        try:
            cache = file(self.cache_file, "wb")
            cPickle.dump(self.sizes, cache)
            cPickle.dump(self.fingerprints, cache)
        except Exception, e:
            print "Couldn't write to cache file: " + str(e)

    def get_file_info(self, top):
        # Walk the tree under `top` and group every file path by its size.
        i = 0
        for root, dirs, files in os.walk(top, topdown=False):
            for name in files:
                if i % 1000 == 0: print i
                path = os.path.join(root, name)
                try:
                    size = os.path.getsize(path)
                    self.sizes[size].append(path)
                except Exception, e:
                    print "Unable to get size of", path, str(e)
                i += 1

    def get_same_sizes(self):
        # Yield (size, paths) for every size shared by more than one file.
        for size in self.sizes:
            if len(self.sizes[size]) > 1:
                yield size, self.sizes[size]

    def get_fingerprint(self, filename):
        # Fingerprint a file by hashing its last 10 KB with SHA-256.
        f = file(filename, "rb")
        try:
            f.seek(-10240, 2)
        except IOError:
            # The file is shorter than 10 KB; hash the whole thing rather
            # than giving every short file the same dummy fingerprint.
            f.seek(0)
        return hashlib.sha256(f.read(10240)).hexdigest()

    def get_dupes(self):
        # For every group of same-sized files, fingerprint each file and
        # yield the subgroups whose fingerprints also match.
        i = 0
        for size, candidates in self.get_same_sizes():
            if i % 100 == 0: print i
            fprints = defaultdict(list)
            for f in candidates:
                fp = self.get_fingerprint(f)
                fprints[fp].append(f)
                self.fingerprints[fp].append(f)
            for group in (fprints[fp] for fp in fprints if len(fprints[fp]) > 1):
                yield size, group
            i += 1

    def get_dupes_from_cache(self):
        return [self.fingerprints[fp] for fp in self.fingerprints if len(self.fingerprints[fp]) > 1]

def run(top):
    fi = FileInfo()
    print str(datetime.now()), "Gathering sizes"
    fi.get_file_info(top)
    print str(datetime.now()), "Finding dupes"
    dupes = [d for d in fi.get_dupes()]
    dupes.sort()
    for s, d in dupes:
        if s < 500: continue    # skip tiny files
        for f in d[1:]:         # keep the first copy, delete the rest
            print "Deleting", f
            os.unlink(f)
    out = file("c:\\temp\\dupes.txt", "w")
    pp = pprint.PrettyPrinter(indent=2)
    print str(datetime.now()), "Writing output"
    out.write(pp.pformat(dupes))
    out.close()
    print str(datetime.now()), "Done"

if __name__ == "__main__":
    run(sys.argv[1])
</pre>
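
The script takes the directory to scan as its only command-line argument; run() walks the tree, groups files by size, fingerprints the groups, and deletes every copy after the first in each duplicate group. A minimal usage sketch, assuming the listing is saved as finddupes.py (the module name and the target directory below are placeholders, not from the original post):

<pre class="prettyprint">
# Hypothetical module name and target directory -- adjust to your setup.
from finddupes import run

# Walks c:\photos, deletes the extra copies in every duplicate group whose
# files are at least 500 bytes, and writes the groups to c:\temp\dupes.txt.
run("c:\\photos")
</pre>

Deletion keeps whichever file happens to come first in each group, so commenting out the os.unlink() call turns this into a report-only dry run.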

http://parand.com/say/index.php/2008/10/15/python-script-for-finding-and-removing-duplicate-files/
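
The listing above is Python 2 (cPickle, the file() builtin, the `except Exception, e` syntax). For reference, here is a minimal Python 3 sketch of the same idea, grouping files by size and then hashing a 10 KB tail with SHA-256; it only reports duplicate groups, skips the caching and deletion steps, and all names in it are mine rather than the original author's:

<pre class="prettyprint">
import hashlib
import os
import sys
from collections import defaultdict

TAIL_BYTES = 10240  # same 10 KB tail the original fingerprints


def fingerprint(path):
    """Hash the last TAIL_BYTES of the file (the whole file if it is shorter)."""
    with open(path, "rb") as f:
        f.seek(0, os.SEEK_END)
        f.seek(max(f.tell() - TAIL_BYTES, 0))
        return hashlib.sha256(f.read(TAIL_BYTES)).hexdigest()


def find_dupes(top):
    # Pass 1: group paths by size -- files of different sizes can never match.
    by_size = defaultdict(list)
    for root, _dirs, files in os.walk(top):
        for name in files:
            path = os.path.join(root, name)
            try:
                by_size[os.path.getsize(path)].append(path)
            except OSError as e:
                print("Unable to get size of", path, e)

    # Pass 2: fingerprint only the groups that share a size.
    for size, paths in by_size.items():
        if len(paths) < 2:
            continue
        by_print = defaultdict(list)
        for path in paths:
            by_print[fingerprint(path)].append(path)
        for group in by_print.values():
            if len(group) > 1:
                yield size, group


if __name__ == "__main__":
    for size, group in find_dupes(sys.argv[1]):
        print(size, group)
</pre>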

 
