Figuring out largest/smallest/median filesizes

I had to get some statistics about file sizes today, but couldn’t really find a tool for the job, so naturally, I wrote one.

import os, sys, re
from os.path import join, getsize, exists
 
def median(numbers):
    """Return the median of ``numbers``.

    For an odd count this is the middle element of the sorted values.
    For an even count it is the mean of the two middle elements, returned
    as a float; when those two elements are equal, that element is
    returned unchanged.

    Raises:
        ValueError: if ``numbers`` is empty.
    """
    s = sorted(numbers)
    l = len(s)
    if l == 0:
        raise ValueError("median() requires at least one number")
    # Floor division keeps the index an integer on both Python 2 and 3.
    mid = l // 2
    if l % 2 == 0:
        a, b = s[mid - 1], s[mid]
        if a != b:
            # Parenthesise the sum: ``a + b / 2.0`` would halve only ``b``.
            return (a + b) / 2.0
        return a
    return s[mid]
 
# Size in bytes of every file under ``target`` that passes the filter.
sizes = []
# Optional compiled regex; when set, only paths it matches are counted.
req_re = None
# Directory to scan; defaults to the current directory.
target = '.'

if len(sys.argv) > 1:
    target = sys.argv[1]

# Honour the pattern whenever a second argument is present, instead of
# silently dropping it when extra arguments follow.
if len(sys.argv) > 2:
    req_re = re.compile(sys.argv[2])

for root, dirs, files in os.walk(target):
    for name in files:
        absp = join(root, name)
        if req_re and not req_re.search(absp):
            continue
        try:
            sizes.append(getsize(absp))
        except OSError:
            # Broken symlink, or the file disappeared or became unreadable
            # during the walk: skip it, as the exists() check used to do.
            continue

num = len(sizes)
total = sum(sizes)

# Single-argument parenthesised print works as both the Python 2
# statement and the Python 3 function.
print("Num files: %d" % num)
if num:
    # float() avoids truncating the average under Python 2 integer division.
    print("Average  : %0.2f KB" % (float(total) / num / 1024.0))
    print("Median   : %0.2f KB" % (median(sizes) / 1024.0))
    print("Min      : %0.2f KB" % (min(sizes) / 1024.0))
    print("Max      : %0.2f KB" % (max(sizes) / 1024.0))
# With no matching files there is nothing to average; the statistics are
# skipped rather than crashing on a division by zero.

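Usage: pass the directory to scan as the first argument (it defaults to the current directory) and, optionally, a regular expression as the second argument to count only the files whose paths match it.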

Leave a Reply

Your email address will not be published. Required fields are marked *

*

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong> <pre lang="" line="" escaped="" highlight="">