Created
January 4, 2014 02:40
-
-
Save ik0r/8250789 to your computer and use it in GitHub Desktop.
查找某个目录及子目录中的重复文件,速度极快 find.py {dir}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| #-*-coding=utf-8-*- | |
| # by oneleaf@ubuntu-cn | |
| import binascii,os,sys | |
| """ | |
| 查找某个目录及子目录中的重复文件,速度极快 | |
| """ | |
# Maps a file size in bytes to the list of paths that have that size.
# Filled in by filesize().
filesizes = {}
# One list of paths per group of files whose contents share a CRC32.
# Filled in by filecrc().
samefiles = []
def filesize(path):
    """Record the size of every file at or below ``path``.

    If ``path`` is a directory its entries are visited recursively;
    otherwise ``path`` is appended to the module-level ``filesizes`` dict
    under its size in bytes.

    Symbolic links to directories found while walking are not followed,
    so a link cycle cannot cause unbounded recursion and the same file is
    not reported under several paths.  Entries that cannot be listed or
    stat'ed (broken links, permission errors) are reported on stderr and
    skipped instead of aborting the scan.

    path -- file or directory to scan.
    """
    if os.path.isdir(path):
        try:
            names = os.listdir(path)
        except OSError as err:
            sys.stderr.write("skipping %s: %s\n" % (path, err))
            return
        for name in names:
            child = os.path.join(path, name)
            # Only the starting path may be a symlinked directory.
            if os.path.islink(child) and os.path.isdir(child):
                continue
            filesize(child)
    else:
        try:
            size = os.path.getsize(path)
        except OSError as err:
            sys.stderr.write("skipping %s: %s\n" % (path, err))
            return
        # setdefault replaces the has_key() check, which Python 3 removed.
        filesizes.setdefault(size, []).append(path)
def _file_crc32(path, chunk_size=1 << 20):
    """Return the CRC32 of the file at ``path``, read in binary chunks."""
    crc = 0
    # Binary mode: text mode would translate or decode the bytes and so
    # change the checksum (and crc32 rejects str on Python 3).
    with open(path, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            # Feeding the previous value back in yields the same result
            # as one crc32 call over the whole content.
            crc = binascii.crc32(chunk, crc)
    return crc


def filecrc(files):
    """Group ``files`` by content CRC32 and record groups of two or more.

    Every group of paths sharing a CRC32 is appended, as one list, to the
    module-level ``samefiles`` list.  No byte-by-byte comparison is done,
    so a group is only as reliable as the checksum.  Files that cannot be
    read are reported on stderr and left out.

    files -- iterable of file paths to compare.
    """
    filecrcs = {}
    for path in files:
        try:
            crc = _file_crc32(path)
        except (IOError, OSError) as err:
            sys.stderr.write("skipping %s: %s\n" % (path, err))
            continue
        filecrcs.setdefault(crc, []).append(path)
    for group in filecrcs.values():
        if len(group) > 1:
            samefiles.append(group)
if __name__ == "__main__":
    # Require the directory argument instead of dying with an IndexError.
    if len(sys.argv) < 2:
        sys.exit("usage: %s {dir}" % sys.argv[0])
    filesize(sys.argv[1])
    # Only sizes shared by at least two paths are passed on to be
    # checksummed; every other file is never opened.
    for sizesamefilelist in filesizes.values():
        if len(sizesamefilelist) > 1:
            filecrc(sizesamefilelist)
    for samefile in samefiles:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("******* same files group **********")
        for path in samefile:
            print(path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment