Skip to content

Instantly share code, notes, and snippets.

@ik0r
Created January 4, 2014 02:40
Show Gist options
  • Select an option

  • Save ik0r/8250789 to your computer and use it in GitHub Desktop.

Select an option

Save ik0r/8250789 to your computer and use it in GitHub Desktop.
查找某个目录及子目录中的重复文件,速度极快 find.py {dir}
#!/usr/bin/env python
#-*-coding=utf-8-*-
# by oneleaf@ubuntu-cn
import binascii,os,sys
"""
Find duplicate files in a directory and its subdirectories; extremely fast.
"""
# Maps a file size in bytes to the list of paths that have that size;
# populated by filesize().
filesizes={}
# List of groups (each a list of paths) whose contents share a CRC-32;
# populated by filecrc().
samefiles=[]
def filesize(path):
    """Record the size of every file under *path* in the global ``filesizes``.

    If *path* is a directory, each of its entries is visited recursively.
    Otherwise *path* is appended to ``filesizes[size]``, where ``size`` is
    the value returned by ``os.path.getsize`` (bytes).

    NOTE: ``os.path.isdir`` follows symbolic links, so symlinked directories
    are traversed as well.
    """
    if os.path.isdir(path):
        for name in os.listdir(path):
            # os.path.join uses the platform separator and avoids a doubled
            # separator when *path* already ends with one.
            filesize(os.path.join(path, name))
    else:
        size = os.path.getsize(path)
        # setdefault replaces dict.has_key(), which was removed in Python 3.
        filesizes.setdefault(size, []).append(path)
def filecrc(files):
    """Group *files* by the CRC-32 of their contents and record duplicates.

    Each group of two or more paths that share a checksum is appended, as a
    list, to the global ``samefiles``. Paths with a unique checksum are
    dropped.

    NOTE(review): matching CRC-32 values (on files of equal size) are treated
    as "same file"; a checksum collision would be reported as a duplicate.
    """
    filecrcs = {}
    for path in files:
        crc = 0
        # Binary mode: text mode alters/decodes the data, which gives wrong
        # checksums on Windows and fails on Python 3 (crc32 needs bytes).
        with open(path, 'rb') as f:
            # Feed the file in chunks with a running CRC so large files are
            # never loaded into memory in one piece.
            for chunk in iter(lambda: f.read(1 << 16), b''):
                crc = binascii.crc32(chunk, crc)
        # setdefault replaces dict.has_key(), which was removed in Python 3.
        filecrcs.setdefault(crc, []).append(path)
    for group in filecrcs.values():
        if len(group) > 1:
            samefiles.append(group)
if __name__ == "__main__":
    # Usage: find.py {dir}
    # Fail with a readable message instead of a bare IndexError traceback
    # when the directory argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s {dir}" % sys.argv[0])
    # Pass 1: bucket every file under the given root by size.
    filesize(sys.argv[1])
    # Pass 2: only sizes shared by at least two files can contain duplicates,
    # so checksums are computed for those buckets only.
    for sizesamefilelist in filesizes.values():
        if len(sizesamefilelist) > 1:
            filecrc(sizesamefilelist)
    for samefile in samefiles:
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        print("******* same files group **********")
        for path in samefile:
            print(path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment