Skip to content

Instantly share code, notes, and snippets.

@ed-asriyan
Created July 16, 2016 12:43
Show Gist options
  • Select an option

  • Save ed-asriyan/8984ecab4bbc4fe73865c995c7a0807a to your computer and use it in GitHub Desktop.

Select an option

Save ed-asriyan/8984ecab4bbc4fe73865c995c7a0807a to your computer and use it in GitHub Desktop.
from math import log
from os import remove
from sys import argv

import fsuniquesearcher
# Third-party dependency used for report rendering. Fail fast with
# installation instructions when it is missing.
try:
    from tabulate import tabulate
except ImportError:  # only the missing-module case; a bare except hid real errors
    print("Module tabulate not installed.")
    print("Please install it first. To do this run the following command:")
    print("\tpython -m pip install tabulate")
    exit()
def size_to_str(size):
    """Format a byte count as a human-readable string, e.g. 2048 -> '2.000 KB'.

    Uses base-1024 units up to terabytes ("T"); anything larger is still
    shown in TB. Sizes below 1 KiB (including 0) are printed as plain bytes.
    """
    size = int(size)
    # Bug fix: the original computed log(1024, size) -- log base *size* of
    # 1024 -- which selects the wrong unit for almost every input and raises
    # for size 0 or 1. The unit exponent is floor(log base 1024 of size);
    # computing it from bit_length avoids float rounding at exact powers.
    if size < 1024:
        power = 0
    else:
        power = min((size.bit_length() - 1) // 10, 4)
    prefix = {
        0: "",
        1: "K",
        2: "M",
        3: "G",
        4: "T",
    }[power]
    return "{:.3f} {:}B".format(size / 1024 ** power, prefix)
def print_table(head, body):
    """Render *body* as a fancy-grid table with *head* as column headers,
    followed by a blank separator line."""
    rendered = tabulate(body, headers=head, tablefmt="fancy_grid")
    print(rendered)
    print('\n')
if __name__ == '__main__':
    if len(argv) < 2:
        print("Usage: python main.py [directories...]")
        print("Example:\tpython main.py ~/Desktop/ ~/Yandex.Disk/")
        exit()

    # Collect files from every directory on the command line, skipping
    # entries already gathered from an earlier directory.
    files = []
    for path in argv[1:]:
        for item in fsuniquesearcher.get_fs_items(path):
            if item not in files:
                files.append(item)
    files_size = sum(file.get_size() for file in files)
    files_count = len(files)

    # Group files with identical content.
    map_ = fsuniquesearcher.FsUniqueItemsMap(files)
    files_groups = map_.get_file_groups()

    # In each group the first file is kept; the rest are removable
    # duplicates. (The original shadowed `files` with its loop variable.)
    files_remove = []
    for group in files_groups:
        files_remove.extend(group[1:])
    files_remove_size = sum(file.get_size() for file in files_remove)
    files_remove_count = len(files_remove)

    # Build the report tables.
    table_samples = []
    for group in files_groups:
        for file in group:
            table_samples.append([file.get_path(), size_to_str(file.get_size()), len(group)])
        table_samples.append(["", "", ""])  # blank row separates groups
    table_remove = [[file.get_path(), size_to_str(file.get_size())] for file in files_remove]
    table_total = [
        ["Files total:", files_count, "Size total:", size_to_str(files_size)],
        ["Remaining files:", files_count - files_remove_count, "Remaining size:", size_to_str(files_size - files_remove_size)],
        ["Files to remove:", files_remove_count, "Removal size:", size_to_str(files_remove_size)],
    ]

    print("Groups of identical files:")
    print_table(["File group", "File size", "Group size"], table_samples)
    print("Files which can be removed:")
    print_table(["File", "Size"], table_remove)
    print_table([], table_total)

    if files_remove_size:
        print("Choose an action")
        print("- delete duplicates (del)")
        print("- save groups of equal files (saveeq)")
        print("- save duplicates (savedub)")
        print("- quit (q)")
        ch = input(": ")
        if ch == "del":
            for file in files_remove:
                path = file.get_path()
                # Bug fix: `remove` was never imported; now from os (see imports).
                remove(path)
                print("Removed", path)
        elif ch == "saveeq":
            with open("eq_files.txt", "w") as f:
                for group in files_groups:
                    for file in group:
                        f.write(file.get_path() + "\n")
                    f.write("\n")
        elif ch == "savedub":
            # Bug fixes: the mode must be the string "w" (was the undefined
            # bare name `w`), and we iterate files_remove -- the original
            # iterated files_remove_size, which is an int.
            with open("dub_files.txt", "w") as f:
                for file in files_remove:
                    f.write(file.get_path() + "\n")
import os
from hashlib import md5, sha256
# --- Exeption classes ---------------------------------------------
class FsItemNotFoundException(Exception):
    """Raised when a path does not point to an existing filesystem item.

    Bug fix: both raise sites in this module pass a second argument
    (the expected item kind, e.g. "f" or "d"), but the original
    __init__ accepted only the path and raised TypeError instead of
    this exception. The second argument is now accepted and optional,
    so existing single-argument callers keep working.
    """

    def __init__(self, path, item_type=None):
        Exception.__init__(self, path)
        self._path = path
        self._item_type = item_type  # "f" / "d" hint from the raiser, or None

    def get_path(self):
        """The path that was not found."""
        return self._path

    def get_item_type(self):
        """The expected item kind supplied by the raiser, or None."""
        return self._item_type
class InvalidTypeException(Exception):
    """Raised when an argument's type differs from what the API expects."""

    def __init__(self, real_type, expected_type):
        self._real_type, self._expected_type = real_type, expected_type

    def get_real_type(self):
        """The type that was actually supplied."""
        return self._real_type

    def get_expected_type(self):
        """The type the caller should have supplied."""
        return self._expected_type
# --- Hash members ------------------------------------------------
def _md5_sum(fname):
    """Return the hex MD5 digest of the file at *fname*.

    Reads in 4 KiB chunks so large files never need to fit in memory.
    """
    digest = md5()
    with open(fname, "rb") as stream:
        chunk = stream.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = stream.read(4096)
    return digest.hexdigest()
def _sha256_sum(fname):
    """Return the hex SHA-256 digest of the file at *fname*.

    Reads in 4 KiB chunks so large files never need to fit in memory.
    """
    digest = sha256()
    with open(fname, "rb") as stream:
        chunk = stream.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = stream.read(4096)
    return digest.hexdigest()
# --- Fs Classes ---------------------------------------------------
class FsItem:
    """Base wrapper around a filesystem path."""

    def __init__(self, path, type="n"):
        # `type` is accepted for backward compatibility but is unused here.
        self._path = path

    def get_path(self):
        """The path this item was created with."""
        return self._path

    def get_name(self):
        """The last path component (file or directory name)."""
        # Bug fix: the original called the undefined bare name `split`
        # (NameError at runtime); the intended helper is os.path.split.
        head, tail = os.path.split(self.get_path())
        return tail

    def __str__(self):
        return self.get_name()
class FsFile(FsItem):
    """A regular file; size and content hash are computed lazily and cached."""

    def __init__(self, path):
        """Wrap *path*; raises FsItemNotFoundException if it is not a file."""
        path = str(path)
        if not os.path.isfile(path):
            # Bug fix: pass only the path; the original passed a second
            # argument that FsItemNotFoundException.__init__ did not accept.
            raise FsItemNotFoundException(path)
        FsItem.__init__(self, path)
        self._hash = None  # combined md5+sha256 hex digest, computed on demand
        self._size = None  # byte size, computed on demand

    def get_hash(self):
        """Content fingerprint: md5 and sha256 hex digests concatenated."""
        if self._hash is None:
            self._hash = _md5_sum(self._path) + _sha256_sum(self._path)
        return self._hash

    def get_size(self):
        """File size in bytes (cached)."""
        # Bug fix: `if not self._size` treated a cached size of 0 as
        # "not computed", re-stating empty files on every call; test the
        # None sentinel instead.
        if self._size is None:
            self._size = os.path.getsize(self._path)
        return self._size

    def __str__(self):
        return FsItem.__str__(self)

    def __eq__(self, obj):
        # Bug fix: the original returned True whenever the sizes merely
        # matched, declaring any two same-sized files identical. Equal
        # size is only a necessary condition; the content hashes decide.
        if obj.get_size() != self.get_size():
            return False
        return obj.get_hash() == self.get_hash()
# --- Local search members -----------------------------------------
def get_fs_items(dir_path, deep=-1):
    """Recursively collect FsFile objects under *dir_path*.

    *deep* limits recursion depth; a negative value (the default) means
    unlimited. Raises FsItemNotFoundException when *dir_path* is not a
    directory.
    """
    dir_path = str(dir_path)
    if not os.path.isdir(dir_path):
        # Bug fix: pass only the path; the original passed a second
        # argument that FsItemNotFoundException.__init__ did not accept.
        raise FsItemNotFoundException(dir_path)
    result = []
    for name in os.listdir(dir_path):
        full = os.path.join(dir_path, name)
        if os.path.isfile(full):
            result.append(FsFile(full))
        elif deep != 0:
            result += get_fs_items(full, deep - 1)
    return result
# --- Public members -----------------------------------------------
# --- Analyzing Classes --------------------------------------------
class FsUniqueItemsMap:
    """Groups a list of files into sets of files with identical content.

    Identity is decided cheaply first (equal size) and then confirmed
    with content hashes, so hashes are only computed for files whose
    size collides with another file's.
    """

    def __init__(self, files_list):
        # Items may be FsFile objects or path strings (converted on demand).
        self._files = files_list
        self._group_list = None  # cached result of get_file_groups()

    def get_file_groups(self):
        """Return a list of groups (lists) of files with identical content.

        Only groups with two or more members are returned. The result is
        computed once and cached. Raises InvalidTypeException for items
        that are neither FsFile nor str.
        """
        if self._group_list is not None:
            return self._group_list

        # Pass 1: bucket by size -- files of different sizes cannot match.
        size_map = {}
        for item in self._files:
            if isinstance(item, str):
                item = FsFile(item)
            if not isinstance(item, FsFile):
                raise InvalidTypeException(type(item), FsFile)
            size_map.setdefault(item.get_size(), []).append(item)

        # Pass 2: hash only files involved in a size collision.
        hash_map = {}
        for same_size in size_map.values():
            if len(same_size) > 1:
                for item in same_size:
                    hash_map.setdefault(item.get_hash(), []).append(item)

        # Keep only genuine duplicate groups (2+ files with equal hashes).
        self._group_list = [group for group in hash_map.values() if len(group) > 1]
        return self._group_list
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment